import numpy as np
l=[1,2,3,4,5]
#convert to array
arr=np.array(l)
type(arr)
numpy.ndarray
np.asarray(l)
array([1, 2, 3, 4, 5])
arr1=np.array([[1,2,3],[2,3,4]])
arr.ndim
1
arr1.ndim
2
mat=np.matrix(l)
mat
matrix([[1, 2, 3, 4, 5]])
a=arr
a
array([1, 2, 3, 4, 5])
arr[0]
1
arr[0]=100
a
array([100, 2, 3, 4, 5])
b=np.copy(arr)
b
array([100, 2, 3, 4, 5])
b[0]=234
b
array([234, 2, 3, 4, 5])
arr
array([100, 2, 3, 4, 5])
list(i*i for i in range(5))
[0, 1, 4, 9, 16]
np.fromstring('23 56 76',sep=' ')
array([23., 56., 76.])
arr.size
5
arr1.size
6
arr.shape
(5,)
import numpy as np
list(range(5))
[0, 1, 2, 3, 4]
np.arange(.4,10.4,0.2)
array([ 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8, 2. , 2.2, 2.4,
2.6, 2.8, 3. , 3.2, 3.4, 3.6, 3.8, 4. , 4.2, 4.4, 4.6,
4.8, 5. , 5.2, 5.4, 5.6, 5.8, 6. , 6.2, 6.4, 6.6, 6.8,
7. , 7.2, 7.4, 7.6, 7.8, 8. , 8.2, 8.4, 8.6, 8.8, 9. ,
9.2, 9.4, 9.6, 9.8, 10. , 10.2])
np.zeros((3,4))
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
np.zeros((3,4,2)) #dimention,row,column
array([[[0., 0.],
[0., 0.],
[0., 0.],
[0., 0.]],
[[0., 0.],
[0., 0.],
[0., 0.],
[0., 0.]],
[[0., 0.],
[0., 0.],
[0., 0.],
[0., 0.]]])
np.ones(5)
array([1., 1., 1., 1., 1.])
np.ones((3,4))
array([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])
np.random.rand(2,3)
array([[0.88744277, 0.31751246, 0.56421228],
[0.27202733, 0.34274517, 0.31403072]])
arr2=np.random.randint(1,5,(3,4))
arr2
array([[3, 4, 4, 2],
[1, 1, 3, 4],
[1, 2, 3, 2]])
arr2.size
12
arr2.reshape(4,3)
array([[3, 4, 4],
[2, 1, 1],
[3, 4, 1],
[2, 3, 2]])
arr2>2
array([[ True, True, True, False],
[False, False, True, True],
[False, False, True, False]])
arr1[0]
array([1, 2, 3])
arr2
array([[3, 4, 4, 2],
[1, 1, 3, 4],
[1, 2, 3, 2]])
arr2[2:4,[2,3]]
array([[3, 2]])
arr1@arr2 #matrix multiplication
array([[ 8, 12, 19, 16],
[13, 19, 29, 24]])
arr3=np.random.randint(1,10,(4,4))
arr3
array([[2, 8, 8, 4],
[8, 4, 5, 2],
[9, 1, 3, 6],
[9, 8, 9, 9]])
arr3.T
array([[2, 8, 9, 9],
[8, 4, 1, 8],
[8, 5, 3, 9],
[4, 2, 6, 9]])
np.repeat(data)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [61], in <cell line: 1>() ----> 1 np.repeat(data) NameError: name 'data' is not defined
data=np.random.randint(3,6,(1,4))
data
array([[4, 3, 4, 3]])
np.repeat(data,4) #it repeat 4 times
array([4, 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3])
np.diag(np.array([1,2,3,4])) #diagonal matrix
array([[1, 0, 0, 0],
[0, 2, 0, 0],
[0, 0, 3, 0],
[0, 0, 0, 4]])
arr1=np.random.randint(1,10,(3,4))
arr2=np.random.randint(1,10,(3,4))
arr1
array([[7, 7, 6, 3],
[3, 5, 7, 5],
[8, 1, 6, 2]])
arr2
array([[8, 2, 9, 7],
[3, 8, 3, 6],
[6, 3, 7, 9]])
arr1>arr2
array([[False, True, False, False],
[False, False, True, False],
[ True, False, False, False]])
arr=np.array(['sudh','kumar'])
arr
array(['sudh', 'kumar'], dtype='<U5')
#convert to upper case
np.char.upper(arr)
array(['SUDH', 'KUMAR'], dtype='<U5')
np.char.capitalize(arr)
array(['Sudh', 'Kumar'], dtype='<U5')
arr1
array([[7, 7, 6, 3],
[3, 5, 7, 5],
[8, 1, 6, 2]])
np.sin(arr1)
array([[ 0.6569866 , 0.6569866 , -0.2794155 , 0.14112001],
[ 0.14112001, -0.95892427, 0.6569866 , -0.95892427],
[ 0.98935825, 0.84147098, -0.2794155 , 0.90929743]])
np.tan(arr1)
array([[ 0.87144798, 0.87144798, -0.29100619, -0.14254654],
[-0.14254654, -3.38051501, 0.87144798, -3.38051501],
[-6.79971146, 1.55740772, -0.29100619, -2.18503986]])
np.exp(arr1) #exponentional
array([[1.09663316e+03, 1.09663316e+03, 4.03428793e+02, 2.00855369e+01],
[2.00855369e+01, 1.48413159e+02, 1.09663316e+03, 1.48413159e+02],
[2.98095799e+03, 2.71828183e+00, 4.03428793e+02, 7.38905610e+00]])
np.mean(arr1)
5.0
np.median(arr1)
5.5
np.std(arr1) #standard division
2.160246899469287
np.var(arr1)
4.666666666666667
np.max(arr1)
8
np.min(arr1)
1
np.multiply(arr1,arr2)
array([[56, 14, 54, 21],
[ 9, 40, 21, 30],
[48, 3, 42, 18]])
np.subtract(arr1,arr2)
array([[-1, 5, -3, -4],
[ 0, -3, 4, -1],
[ 2, -2, -1, -7]])
np.mod(arr1,arr2)
array([[7, 1, 6, 3],
[0, 5, 1, 5],
[2, 1, 6, 2]], dtype=int32)
arr=np.array([2,3,0,9,5,0,6])
np.sort(arr)
array([0, 0, 2, 3, 5, 6, 9])
np.count_nonzero(arr)
5
np.where(arr>0)
(array([0, 1, 3, 4, 6], dtype=int64),)
np.extract(arr>2,arr1)
array([7, 3, 3, 7])
import matplotlib.pyplot as plt
import numpy as np
x=np.random.rand(50)
y=np.random.rand(50)
x
array([0.20390335, 0.06334784, 0.21002773, 0.20997411, 0.81657932,
0.80534733, 0.07292785, 0.46606985, 0.09790799, 0.1854319 ,
0.77042002, 0.19494667, 0.10106707, 0.75044698, 0.33694849,
0.38205424, 0.9060851 , 0.7114721 , 0.39069017, 0.45077087,
0.70085369, 0.45179714, 0.68159053, 0.48475405, 0.90123781,
0.12672423, 0.81312689, 0.12390088, 0.83231107, 0.72636327,
0.75576696, 0.6091184 , 0.4568806 , 0.50639269, 0.62788258,
0.96336562, 0.14599948, 0.34843149, 0.71688237, 0.88023386,
0.68166321, 0.74488392, 0.06832264, 0.64024834, 0.75502204,
0.90163984, 0.47112274, 0.32671817, 0.8732572 , 0.60609986])
y
array([0.67058628, 0.4106971 , 0.20099331, 0.10702518, 0.06435525,
0.12069608, 0.04153432, 0.55397627, 0.56690531, 0.79014101,
0.18397709, 0.22950558, 0.00333029, 0.08779434, 0.64317556,
0.56996028, 0.80141139, 0.64052987, 0.31799524, 0.52277043,
0.54365779, 0.91630439, 0.75594956, 0.36419699, 0.27966255,
0.57334445, 0.90302377, 0.96860598, 0.28965098, 0.84979023,
0.97779768, 0.60484796, 0.88357525, 0.14307741, 0.53958289,
0.66857452, 0.75347336, 0.09605838, 0.163594 , 0.81079885,
0.96818755, 0.73066672, 0.79780539, 0.22988367, 0.69320177,
0.54892592, 0.52539475, 0.69227404, 0.0391454 , 0.82968987])
plt.scatter(x,y) #scatter map
<matplotlib.collections.PathCollection at 0x2263e091700>
plt.figure(figsize=(6,4))
plt.scatter(x,y,c='r')
plt.xlabel("this is x axis")
plt.ylabel("this is y axis")
plt.title("this is x vs y")
plt.grid()
plt.plot(x,y)
[<matplotlib.lines.Line2D at 0x2263fa48520>]
x=np.linspace(1,10,100)
y=np.sin(x)
x
array([ 1. , 1.09090909, 1.18181818, 1.27272727, 1.36363636,
1.45454545, 1.54545455, 1.63636364, 1.72727273, 1.81818182,
1.90909091, 2. , 2.09090909, 2.18181818, 2.27272727,
2.36363636, 2.45454545, 2.54545455, 2.63636364, 2.72727273,
2.81818182, 2.90909091, 3. , 3.09090909, 3.18181818,
3.27272727, 3.36363636, 3.45454545, 3.54545455, 3.63636364,
3.72727273, 3.81818182, 3.90909091, 4. , 4.09090909,
4.18181818, 4.27272727, 4.36363636, 4.45454545, 4.54545455,
4.63636364, 4.72727273, 4.81818182, 4.90909091, 5. ,
5.09090909, 5.18181818, 5.27272727, 5.36363636, 5.45454545,
5.54545455, 5.63636364, 5.72727273, 5.81818182, 5.90909091,
6. , 6.09090909, 6.18181818, 6.27272727, 6.36363636,
6.45454545, 6.54545455, 6.63636364, 6.72727273, 6.81818182,
6.90909091, 7. , 7.09090909, 7.18181818, 7.27272727,
7.36363636, 7.45454545, 7.54545455, 7.63636364, 7.72727273,
7.81818182, 7.90909091, 8. , 8.09090909, 8.18181818,
8.27272727, 8.36363636, 8.45454545, 8.54545455, 8.63636364,
8.72727273, 8.81818182, 8.90909091, 9. , 9.09090909,
9.18181818, 9.27272727, 9.36363636, 9.45454545, 9.54545455,
9.63636364, 9.72727273, 9.81818182, 9.90909091, 10. ])
y
array([ 0.84147098, 0.88704699, 0.92529707, 0.95590534, 0.978619 ,
0.99325047, 0.99967891, 0.99785123, 0.98778253, 0.96955595,
0.94332203, 0.90929743, 0.86776314, 0.8190622 , 0.76359681,
0.70182505, 0.63425707, 0.56145091, 0.48400786, 0.40256749,
0.31780241, 0.23041267, 0.14112001, 0.05066187, -0.04021468,
-0.1307591 , -0.22022362, -0.30786935, -0.39297247, -0.47483011,
-0.55276624, -0.6261372 , -0.69433703, -0.7568025 , -0.8130177 ,
-0.86251837, -0.9048957 , -0.93979971, -0.96694212, -0.98609877,
-0.99711147, -0.99988924, -0.99440916, -0.98071647, -0.95892427,
-0.92921254, -0.89182665, -0.84707537, -0.79532828, -0.73701276,
-0.67261042, -0.60265314, -0.52771868, -0.44842592, -0.36542971,
-0.2794155 , -0.19109366, -0.10119362, -0.01045784, 0.0803643 ,
0.17052273, 0.25927286, 0.34588171, 0.429634 , 0.50983804,
0.58583144, 0.6569866 , 0.72271585, 0.78247636, 0.83577457,
0.88217031, 0.92128041, 0.95278186, 0.9764145 , 0.99198316,
0.99935926, 0.99848187, 0.98935825, 0.97206374, 0.94674118,
0.9135997 , 0.87291301, 0.82501713, 0.77030762, 0.70923631,
0.64230758, 0.57007418, 0.49313267, 0.41211849, 0.32770071,
0.24057653, 0.15146548, 0.06110351, -0.0297631 , -0.1203839 ,
-0.21001048, -0.29790263, -0.38333447, -0.46560043, -0.54402111])
plt.plot(x,y)
[<matplotlib.lines.Line2D at 0x2263fae3910>]
x=['a','b','c','d','e']
y=np.random.rand(5)
x
['a', 'b', 'c', 'd', 'e']
y
array([0.26997769, 0.43015399, 0.93267056, 0.68002515, 0.72440375])
plt.bar(x,y)
plt.xlabel("representing my catagorical")
plt.ylabel("representing my num vakues")
plt.title("bar plot")
plt.figure(figsize=(5,3))
<Figure size 360x216 with 0 Axes>
<Figure size 360x216 with 0 Axes>
data=[1,1,1,5,5,8,7,8,8,9,9,0,4,2,3]
plt.hist(data) #it shows the frequency 1 repeated 3times
(array([1., 3., 1., 1., 1., 2., 0., 1., 3., 2.]), array([0. , 0.9, 1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9. ]), <BarContainer object of 10 artists>)
x=np.random.rand(50)
y=np.random.rand(50)
z=np.random.rand(50)
fig=plt.figure()
ax=fig.add_subplot(projection='3d')
ax.scatter(x,y,z)
plt.show()
import seaborn as sns
iris=sns.load_dataset('iris')
iris
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
sns.scatterplot(x=iris.sepal_length,y=iris.sepal_width)
<AxesSubplot:xlabel='sepal_length', ylabel='sepal_width'>
tips=sns.load_dataset('tips')
tips
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
sns.scatterplot(x=tips.total_bill,y=tips.tip)
<AxesSubplot:xlabel='total_bill', ylabel='tip'>
tips.head()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
tips['smoker'].value_counts()
No 151 Yes 93 Name: smoker, dtype: int64
sns.relplot(x=tips.total_bill,y=tips.tip,data=tips,hue="smoker")
<seaborn.axisgrid.FacetGrid at 0x121d4f01c70>
#how do tips vary between smokers and non-smokers
sns.relplot(x=tips.total_bill,y=tips.tip,data=tips,style="smoker")
<seaborn.axisgrid.FacetGrid at 0x22f50b0cb50>
#
sns.relplot(x=tips.total_bill,y=tips.tip,data=tips,style="size")
<seaborn.axisgrid.FacetGrid at 0x22f517c9e20>
#whether the person came for lunch or dinner
sns.relplot(x=tips.total_bill,y=tips.tip,data=tips,style="size",hue='time')
<seaborn.axisgrid.FacetGrid at 0x22f517c91c0>
#how many people are coming to the restaurant daily
sns.catplot(x='day',y='total_bill',data=tips)
<seaborn.axisgrid.FacetGrid at 0x22f5186c4f0>
sns.jointplot(x=tips.total_bill,y=tips.tip)
<seaborn.axisgrid.JointGrid at 0x22f5294f040>
sns.pairplot(tips)
<seaborn.axisgrid.PairGrid at 0x22f517ad6a0>
age=[12,13,14,15,21,24]
(12+13+14+15+21+24)/6
16.5
import numpy as np
#mean
np.mean(age)
16.5
#median
np.median(age)
14.5
#mode
from scipy import stats
stats.mode(age)
ModeResult(mode=array([12]), count=array([1]))
ages_lst=[23,24,34,34,23,25,65,75,32]
import numpy as np
mean=np.mean(ages_lst)
mean
37.22222222222222
var=np.var(ages_lst)
var
330.61728395061726
std=np.std(ages_lst)
std
18.182884368290342
data=[[10,12,13],[34,23,65],[32,33,21]]
data
[[10, 12, 13], [34, 23, 65], [32, 33, 21]]
import pandas as pd
df=pd.DataFrame(data,columns=["A","B","C"])
df
| A | B | C | |
|---|---|---|---|
| 0 | 10 | 12 | 13 |
| 1 | 34 | 23 | 65 |
| 2 | 32 | 33 | 21 |
#Row wise
df.var(axis=1)
0 2.333333 1 474.333333 2 44.333333 dtype: float64
#column wise
df.var(axis=0)
A 177.333333 B 110.333333 C 784.000000 dtype: float64
import seaborn as sns
df=sns.load_dataset('healthexp')
df.head()
| Year | Country | Spending_USD | Life_Expectancy | |
|---|---|---|---|---|
| 0 | 1970 | Germany | 252.311 | 70.6 |
| 1 | 1970 | France | 192.143 | 72.2 |
| 2 | 1970 | Great Britain | 123.993 | 71.9 |
| 3 | 1970 | Japan | 150.437 | 72.0 |
| 4 | 1970 | USA | 326.961 | 70.9 |
import numpy as np
df.cov() #covarience
| Year | Spending_USD | Life_Expectancy | |
|---|---|---|---|
| Year | 201.098848 | 2.571883e+04 | 41.915454 |
| Spending_USD | 25718.827373 | 4.817761e+06 | 4166.800912 |
| Life_Expectancy | 41.915454 | 4.166801e+03 | 10.733902 |
#correlation
df.corr(method='spearman')
| Year | Spending_USD | Life_Expectancy | |
|---|---|---|---|
| Year | 1.000000 | 0.931598 | 0.896117 |
| Spending_USD | 0.931598 | 1.000000 | 0.747407 |
| Life_Expectancy | 0.896117 | 0.747407 | 1.000000 |
df=sns.load_dataset('penguins')
df.head()
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
df.corr()
| bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
|---|---|---|---|---|
| bill_length_mm | 1.000000 | -0.235053 | 0.656181 | 0.595110 |
| bill_depth_mm | -0.235053 | 1.000000 | -0.583851 | -0.471916 |
| flipper_length_mm | 0.656181 | -0.583851 | 1.000000 | 0.871202 |
| body_mass_g | 0.595110 | -0.471916 | 0.871202 | 1.000000 |
import scipy.stats as stat
import pylab
import numpy as np
import seaborn as sns
df=sns.load_dataset("iris")
df.head()
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
import matplotlib.pyplot as plt
def plot_data(df, feature):
    """Visualise one numeric column: histogram with KDE next to a normal Q-Q plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing `feature` as a numeric column (assumed numeric — TODO confirm).
    feature : str
        Name of the column to plot.

    Side effect: renders a matplotlib figure via plt.show(); returns None.
    """
    plt.figure(figsize=(10, 6))
    # Left panel: empirical distribution of the feature.
    plt.subplot(1, 2, 1)
    sns.histplot(df[feature], kde=True)
    # Right panel: Q-Q plot against the normal distribution to eyeball normality.
    plt.subplot(1, 2, 2)
    # probplot draws onto the current axes; `plot=plt` replaces the deprecated
    # `pylab` module with the supported pyplot interface (same output).
    stat.probplot(df[feature], dist='norm', plot=plt)
    plt.show()
plot_data(df,'sepal_length')
plot_data(df,'sepal_width')
plot_data(df,'petal_length')
Chi square test
import scipy.stats as stat
import numpy as np
#No of hours a student studies daily, on a weekly basis
#mond,tus,wend,thus,friday,saturday,sunday
expected_data=[8,6,7,9,6,9,7]
observed_data=[7,8,6,9,9,6,7]
sum(expected_data),sum(observed_data)
(52, 52)
#chi square goodness of fit
chisquare_test_statistics,p_value=stat.chisquare(observed_data,expected_data)
print(chisquare_test_statistics),print(p_value)
3.4345238095238093 0.7526596580922865
(None, None)
#find the critical value
significance=0.05
dof=len(observed_data)-1
critical_value=stat.chi2.ppf(1-significance,dof)
critical_value
12.591587243743977
# Decision rule: fail to reject H0 when the statistic is within the critical value.
if chisquare_test_statistics <= critical_value:
    print("accept the null hypothesis")
else:
    print("reject the null hypothesis")
accept the null hypothesis
import seaborn as sns
df=sns.load_dataset('titanic')
df.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
#check missing values
df.isnull()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
| 887 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 888 | False | False | False | True | False | False | False | False | False | False | False | True | False | False | False |
| 889 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 890 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
891 rows × 15 columns
df.isnull().sum() #in which column how many null values are there
survived 0 pclass 0 sex 0 age 177 sibsp 0 parch 0 fare 0 embarked 2 class 0 who 0 adult_male 0 deck 688 embark_town 2 alive 0 alone 0 dtype: int64
#delete the rows or data points to handle missing values
df.shape
(891, 15)
df.dropna().shape #it drop all nan values
(182, 15)
##column-wise delete
df.dropna(axis=1)
| survived | pclass | sex | sibsp | parch | fare | class | who | adult_male | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 1 | 0 | 7.2500 | Third | man | True | no | False |
| 1 | 1 | 1 | female | 1 | 0 | 71.2833 | First | woman | False | yes | False |
| 2 | 1 | 3 | female | 0 | 0 | 7.9250 | Third | woman | False | yes | True |
| 3 | 1 | 1 | female | 1 | 0 | 53.1000 | First | woman | False | yes | False |
| 4 | 0 | 3 | male | 0 | 0 | 8.0500 | Third | man | True | no | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | male | 0 | 0 | 13.0000 | Second | man | True | no | True |
| 887 | 1 | 1 | female | 0 | 0 | 30.0000 | First | woman | False | yes | True |
| 888 | 0 | 3 | female | 1 | 2 | 23.4500 | Third | woman | False | no | False |
| 889 | 1 | 1 | male | 0 | 0 | 30.0000 | First | man | True | yes | True |
| 890 | 0 | 3 | male | 0 | 0 | 7.7500 | Third | man | True | no | True |
891 rows × 11 columns
1-Mean Value Imputation
sns.histplot(df['age'],kde=True)
<AxesSubplot:xlabel='age', ylabel='Count'>
df["age_mean"]=df['age'].fillna(df['age'].mean()) #it replace all the NSAN values with mean of the age
df["age_mean"]=df['age'].fillna(df['age'].mean())
df['age_mean']=df['age'].fillna(df['age'].mean())
df['age_median']=df['age'].fillna(df['age'].median())
df["age_median"]
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
...
886 27.0
887 19.0
888 28.0
889 26.0
890 32.0
Name: age_median, Length: 891, dtype: float64
df[["age_mean",'age']]
| age_mean | age | |
|---|---|---|
| 0 | 22.000000 | 22.0 |
| 1 | 38.000000 | 38.0 |
| 2 | 26.000000 | 26.0 |
| 3 | 35.000000 | 35.0 |
| 4 | 35.000000 | 35.0 |
| ... | ... | ... |
| 886 | 27.000000 | 27.0 |
| 887 | 19.000000 | 19.0 |
| 888 | 29.699118 | NaN |
| 889 | 26.000000 | 26.0 |
| 890 | 32.000000 | 32.0 |
891 rows × 2 columns
df['age_median']=df['age'].fillna(df['age'].median())
df['age_median']=df['age'].fillna(df['age'].median())
df[['age_median','age']]
| age_median | age | |
|---|---|---|
| 0 | 22.0 | 22.0 |
| 1 | 38.0 | 38.0 |
| 2 | 26.0 | 26.0 |
| 3 | 35.0 | 35.0 |
| 4 | 35.0 | 35.0 |
| ... | ... | ... |
| 886 | 27.0 | 27.0 |
| 887 | 19.0 | 19.0 |
| 888 | 28.0 | NaN |
| 889 | 26.0 | 26.0 |
| 890 | 32.0 | 32.0 |
891 rows × 2 columns
df[df['embarked'].isnull()]
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | age_median | age_mean | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 61 | 1 | 1 | female | 38.0 | 0 | 0 | 80.0 | NaN | First | woman | False | B | NaN | yes | True | 38.0 | 38.0 |
| 829 | 1 | 1 | female | 62.0 | 0 | 0 | 80.0 | NaN | First | woman | False | B | NaN | yes | True | 62.0 | 62.0 |
df['embarked'].unique()
array(['S', 'C', 'Q', nan], dtype=object)
mode_values=df[df['embarked'].notna()]['embarked'].mode()[0]
df['median_age']=df['age'].fillna(df['age'].median())
df['embarked'].unique()
array(['S', 'C', 'Q', nan], dtype=object)
df[df['embarked'].notna()]['embarked'].mode()[0]
mode_value=df[df['embarked'].notna()]['embarked'].mode()[0]
df[df['embarked'].notna()]#it don't gives NAN values so we can do mofe
['embarked'].mode()[0] #this is for doing mode
df["embarked_mode"]=df['embarked'].fillna(mode_value)
df[['embarked_mode','embarked']]
| embarked_mode | embarked | |
|---|---|---|
| 0 | S | S |
| 1 | C | C |
| 2 | S | S |
| 3 | S | S |
| 4 | S | S |
| ... | ... | ... |
| 886 | S | S |
| 887 | S | S |
| 888 | S | S |
| 889 | C | C |
| 890 | Q | Q |
891 rows × 2 columns
df['embarked_mode'].isnull().sum()
0
import numpy as np
import pandas as pd
#set the random seed for reproducibility
np.random.seed(123)
#create a dataframe with two classes
n_samples=1000
class_0_ratio=0.9
n_class_0=int(n_samples * class_0_ratio)
n_class_1=n_samples - n_class_0
n_class_0,n_class_1
(900, 100)
import numpy as np
x=np.array([1,2,3,4,5,])
y=np.array([2,4,6,8,10])
import matplotlib.pyplot as plt
plt.scatter(x,y)
<matplotlib.collections.PathCollection at 0x29db19f7790>
#interpolate the data using linear interpolation
x_new=np.linspace(1,5,10) #create new x values 1 to 5 ,in between 10 numbers
y_interp=np.interp(x_new,x,y)
plt.scatter(x_new,y_interp)
<matplotlib.collections.PathCollection at 0x29db1b0ce50>
import numpy as np
x=np.array([1,2,3,4,5])
y=np.array([1,8,27,64,125])
from scipy.interpolate import interp1d
#create a cubic interpolation function
f=interp1d(x,y,kind='cubic')
#interpolate the data
x_new=np.linspace(1,5,10)
y_interp=f(x_new)
plt.scatter(x,y)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [6], in <cell line: 1>() ----> 1 plt.scatter(x,y) NameError: name 'plt' is not defined
plt.scatter(x_new,y_interp)
<matplotlib.collections.PathCollection at 0x29db2cf1e20>
#create some sample data
x=np.array([1,2,3,4,5])
y=np.array([1,4,9,16,25])
#interpolate the data using polynomial interpolation
p=np.polyfit(x,y,2)
x_new=np.linspace(1,5,10) #create new x values
y_interp=np.polyval(p,x_new) #interpolate y valies
plt.scatter(x_new,y_interp)
<matplotlib.collections.PathCollection at 0x29db3d2d190>
import seaborn as sns
df=sns.load_dataset('healthexp')
df.head()
| Year | Country | Spending_USD | Life_Expectancy | |
|---|---|---|---|---|
| 0 | 1970 | Germany | 252.311 | 70.6 |
| 1 | 1970 | France | 192.143 | 72.2 |
| 2 | 1970 | Great Britain | 123.993 | 71.9 |
| 3 | 1970 | Japan | 150.437 | 72.0 |
| 4 | 1970 | USA | 326.961 | 70.9 |
#covariance
import numpy as np
df.cov()
| Year | Spending_USD | Life_Expectancy | |
|---|---|---|---|
| Year | 201.098848 | 2.571883e+04 | 41.915454 |
| Spending_USD | 25718.827373 | 4.817761e+06 | 4166.800912 |
| Life_Expectancy | 41.915454 | 4.166801e+03 | 10.733902 |
##Correlation
df.corr(method='spearman')
| Year | Spending_USD | Life_Expectancy | |
|---|---|---|---|
| Year | 1.000000 | 0.931598 | 0.896117 |
| Spending_USD | 0.931598 | 1.000000 | 0.747407 |
| Life_Expectancy | 0.896117 | 0.747407 | 1.000000 |
#Pearson correlation
df.corr(method='pearson')
| Year | Spending_USD | Life_Expectancy | |
|---|---|---|---|
| Year | 1.000000 | 0.826273 | 0.902175 |
| Spending_USD | 0.826273 | 1.000000 | 0.579430 |
| Life_Expectancy | 0.902175 | 0.579430 | 1.000000 |
import pandas as pd
df=pd.read_csv('winequality-red.csv')
df.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
#summary of the dataset
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
#descriptive summary of the dataset
df.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
| mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
| std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
| min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
| 25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
| 50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
| 75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997835 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
df.shape
(1599, 12)
#list down all the column names
df.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
df['quality'].unique()
array([5, 6, 7, 4, 8, 3], dtype=int64)
#missing values in the dataset
df.isnull().sum()
fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
#duplicate records
df[df.duplicated()]
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 7.4 | 0.700 | 0.00 | 1.90 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 |
| 11 | 7.5 | 0.500 | 0.36 | 6.10 | 0.071 | 17.0 | 102.0 | 0.99780 | 3.35 | 0.80 | 10.5 | 5 |
| 27 | 7.9 | 0.430 | 0.21 | 1.60 | 0.106 | 10.0 | 37.0 | 0.99660 | 3.17 | 0.91 | 9.5 | 5 |
| 40 | 7.3 | 0.450 | 0.36 | 5.90 | 0.074 | 12.0 | 87.0 | 0.99780 | 3.33 | 0.83 | 10.5 | 5 |
| 65 | 7.2 | 0.725 | 0.05 | 4.65 | 0.086 | 4.0 | 11.0 | 0.99620 | 3.41 | 0.39 | 10.9 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1563 | 7.2 | 0.695 | 0.13 | 2.00 | 0.076 | 12.0 | 20.0 | 0.99546 | 3.29 | 0.54 | 10.1 | 5 |
| 1564 | 7.2 | 0.695 | 0.13 | 2.00 | 0.076 | 12.0 | 20.0 | 0.99546 | 3.29 | 0.54 | 10.1 | 5 |
| 1567 | 7.2 | 0.695 | 0.13 | 2.00 | 0.076 | 12.0 | 20.0 | 0.99546 | 3.29 | 0.54 | 10.1 | 5 |
| 1581 | 6.2 | 0.560 | 0.09 | 1.70 | 0.053 | 24.0 | 32.0 | 0.99402 | 3.54 | 0.60 | 11.3 | 5 |
| 1596 | 6.3 | 0.510 | 0.13 | 2.30 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 |
240 rows × 12 columns
##remove the duplicates
df.drop_duplicates(inplace=True) #inplace
df.shape
(1359, 12)
## Minimum,Maximum,Median,Q1,Q3,IQR
import numpy as np
lst_marks=[42,32,56,75,89,54,32,89,90,87,67,54,45,98,99,67,74]
minimum,Q1,median,Q3,maximum=np.quantile(lst_marks,[0,0.25,0.50,0.75,1.0])
minimum,Q1,median,Q3,maximum
(32.0, 54.0, 67.0, 89.0, 99.0)
IQR=Q3-Q1
print(IQR)
35.0
lower_fence=Q1-1.5*(IQR)
higher_fence=Q3+1.5*(IQR)
lower_fence
1.5
higher_fence
141.5
import seaborn as sns
sns.boxplot(lst_marks)
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:>
import seaborn as sns
df=sns.load_dataset('tips')
df.head()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
import numpy as np
mean=np.mean(df['total_bill'])
std=np.std(df['total_bill'])
print(mean,std)
19.785942622950824 8.88415057777113
# Z-score standardisation of every bill: (x - mean) / std.
# A comprehension replaces the manual append loop (PERF401); values and
# ordering are identical to iterating list(df['total_bill']).
normalized_data = [(bill - mean) / std for bill in df['total_bill']]
normalized_data
[-0.3147113050904947, -1.0632353132988699, 0.13777989987156108, 0.4383151031672544, 0.5407447042905058, 0.6195367051545452, -1.239954515236787, 0.7985071071171492, -0.5342033074974618, -0.5634689078183908, -1.071114513385274, 1.7417599174609364, -0.49143050702841173, -0.15262490331304188, -0.5578409077566736, 0.20193910057513573, -1.0643609133112133, -0.39350330595453414, -0.31696250511518154, 0.09725829942719756, -0.21003050394255615, 0.056736698982834455, -0.45203450659639205, 2.2100095225958003, 0.003833498402693769, -0.22241210407833414, -0.7221785095588132, -0.7987193103981659, 0.2154463007232569, -0.01530170180714459, -1.152157714274, -0.16162970341178906, -0.5319521074727749, 0.10176069947657156, -0.22578890411536412, 0.4810879036363043, -0.3912521059298474, -0.3214649051645551, -0.12335930299211276, 1.2926455125359113, -0.4216433062631197, -0.26180810451035363, -0.6580193088552382, -1.1375249141135357, 1.1947183114620337, -0.16838330348584984, 0.27510310137745836, 1.4198383139307178, 0.9864823091785008, -0.1965233037944354, -0.8156033105833173, -1.0688633133605872, 1.6911079169054828, -1.108259313792607, 0.6499279054878175, -0.03331130200463935, 2.0512999208553775, 0.7456039065370086, -0.9619313121879619, 3.2061655335197283, 0.056736698982834455, -0.6726521090157025, -0.9866945124595172, -0.16838330348584984, -0.24717530434988927, 0.0330990987236225, -0.3754937057570394, -1.8815465222725374, 0.049983098908774067, -0.537580107534492, -0.8741345112251752, -0.3057065049917472, 0.7962559070924624, 0.618411105142202, -0.5690969078801078, -1.0441001130890317, -0.21003050394255615, 0.8345263075121387, 0.33475990203165984, -0.280943304720192, -0.03893930206635614, -0.3518561054978275, -1.0936265136321421, 1.451355114276334, -0.4283969063371801, 1.6933591169301692, -0.7604489099784896, -0.16950890349819306, 0.5542519044386269, 0.15466390005671224, 1.0337575096969243, 0.3043687016983871, -1.5798857189645004, -0.39012650591750375, 0.33363430201931626, 
2.294429523521557, 0.8435311076108863, -0.8730089112128318, 0.13777989987156108, -0.8246081106820644, -0.9495497120521843, -0.4959329070777853, 2.7593023286193894, 0.2964895016119835, 0.1276494997604703, -0.49818410710247235, 0.0792486992297028, 0.610531905055798, -0.17401130354756705, -0.6163721083985315, -0.6512657087811776, -1.411045717112987, 2.058053520929438, 0.4687063035005267, 0.6668119056729691, -0.2786921046955052, 1.1418151108818928, -1.0283417129162238, -0.8279849107190949, 0.48333910366099114, -0.911279311632508, -0.7165505094970961, -0.6220001084602487, -0.4317737063742105, -0.8223569106573776, 1.1271823107214285, -1.2680945155453727, -0.5927345081393197, -0.946172912015154, 0.34151350210572023, -0.07946090251071965, 0.05448549895814766, -0.9698105122743659, -0.8471201109289329, -0.17176010352287985, -1.269220115557716, -1.0643609133112133, -0.6343817085960263, -0.42614570631249327, -0.745816109818025, -0.26068250449801045, 1.6337023162759678, 2.4092407247805854, 0.8176423073269876, -0.3777449057817262, -1.2872297157552108, -0.12898730305382994, -0.8910185114103265, -1.1262689139901016, -1.3817801167920583, -0.6433865086947737, -0.7491929098550552, -0.284320104757222, 0.5362423042411322, -0.001794501659023419, 1.1328103107831458, 3.194909533396294, 0.5868943047965861, -0.7199273095341262, -0.3709913057076658, 0.19293430047638854, -0.802096110435196, -0.40250810605328136, -0.6726521090157025, -0.2561801044486365, 0.5328655042041018, 0.10963989956297554, 1.342171913079022, -1.0350953129902845, -1.0305929129409106, 3.492067936654957, -0.44753210654701825, -1.411045717112987, 1.35793031325183, -0.33384650530033266, 1.4761183145478889, -0.21340730397958654, -0.5972369081886933, -1.146529714212283, 1.6708471166833012, 1.6730983167079878, 0.3989191027352345, 2.877490329915449, 0.38090950253774014, 2.3372023239906063, 0.10176069947657156, 0.1253982997357831, 1.2014719115360943, -0.1841417036586578, 0.37303030245133617, -0.46103930669513943, 
0.0027078983903501713, 0.9741007090427233, -0.48467690695435117, -0.360860905596575, -1.3761521167303412, -1.0632353132988699, 2.6253559271505225, -0.7638257100155198, -0.7064201093860053, -0.12110810296742595, -0.7930913103364486, -0.7638257100155198, -0.3811217058187566, 0.0837510992790768, -0.3732425057323526, 0.76586470675919, 2.1323431217441033, 0.5047255038955163, -0.7908401103117619, 1.1564479110423573, 0.6870727058951507, 3.2129191335937883, -0.7334345096822474, 0.9437095087094509, -0.7750817101389539, 0.9414583086847638, -0.9225353117559422, -1.3558913165081594, 1.1654527111411048, -0.8583761110523671, -0.7165505094970961, -1.261340915471312, -0.4283969063371801, -0.7165505094970961, -0.39575450597922096, -1.0913753136074553, 0.07474629918032921, -0.732308909669904, 0.2627215012416808, 0.47545990357458756, -0.46103930669513943, -0.9202841117312553, -1.0148345127681029, -0.4790489068926342, -1.0936265136321421, -0.8088497105092567, 1.468239114461485, 1.8059191181645113, 1.0405111097709852, 0.8322751074874519, 0.32462950192056905, -0.22128650406599054, -0.11322890288102197]
sns.histplot(df['total_bill'])#distribution of the raw (unscaled) total_bill column
<AxesSubplot:xlabel='total_bill', ylabel='Count'>
Standardization (z-score formula): z = (x - mean) / std — implemented by sklearn's StandardScaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler
StandardScaler()
scaler.fit(df[['total_bill']])#computes the column mean and standard deviation (the z-score parameters)
StandardScaler()
scaler.transform(df[['total_bill']])#applies z = (x - mean) / std to scale the column
array([[-3.14711305e-01],
[-1.06323531e+00],
[ 1.37779900e-01],
[ 4.38315103e-01],
[ 5.40744704e-01],
[ 6.19536705e-01],
[-1.23995452e+00],
[ 7.98507107e-01],
[-5.34203307e-01],
[-5.63468908e-01],
[-1.07111451e+00],
[ 1.74175992e+00],
[-4.91430507e-01],
[-1.52624903e-01],
[-5.57840908e-01],
[ 2.01939101e-01],
[-1.06436091e+00],
[-3.93503306e-01],
[-3.16962505e-01],
[ 9.72582994e-02],
[-2.10030504e-01],
[ 5.67366990e-02],
[-4.52034507e-01],
[ 2.21000952e+00],
[ 3.83349840e-03],
[-2.22412104e-01],
[-7.22178510e-01],
[-7.98719310e-01],
[ 2.15446301e-01],
[-1.53017018e-02],
[-1.15215771e+00],
[-1.61629703e-01],
[-5.31952107e-01],
[ 1.01760699e-01],
[-2.25788904e-01],
[ 4.81087904e-01],
[-3.91252106e-01],
[-3.21464905e-01],
[-1.23359303e-01],
[ 1.29264551e+00],
[-4.21643306e-01],
[-2.61808105e-01],
[-6.58019309e-01],
[-1.13752491e+00],
[ 1.19471831e+00],
[-1.68383303e-01],
[ 2.75103101e-01],
[ 1.41983831e+00],
[ 9.86482309e-01],
[-1.96523304e-01],
[-8.15603311e-01],
[-1.06886331e+00],
[ 1.69110792e+00],
[-1.10825931e+00],
[ 6.49927905e-01],
[-3.33113020e-02],
[ 2.05129992e+00],
[ 7.45603907e-01],
[-9.61931312e-01],
[ 3.20616553e+00],
[ 5.67366990e-02],
[-6.72652109e-01],
[-9.86694512e-01],
[-1.68383303e-01],
[-2.47175304e-01],
[ 3.30990987e-02],
[-3.75493706e-01],
[-1.88154652e+00],
[ 4.99830989e-02],
[-5.37580108e-01],
[-8.74134511e-01],
[-3.05706505e-01],
[ 7.96255907e-01],
[ 6.18411105e-01],
[-5.69096908e-01],
[-1.04410011e+00],
[-2.10030504e-01],
[ 8.34526308e-01],
[ 3.34759902e-01],
[-2.80943305e-01],
[-3.89393021e-02],
[-3.51856105e-01],
[-1.09362651e+00],
[ 1.45135511e+00],
[-4.28396906e-01],
[ 1.69335912e+00],
[-7.60448910e-01],
[-1.69508903e-01],
[ 5.54251904e-01],
[ 1.54663900e-01],
[ 1.03375751e+00],
[ 3.04368702e-01],
[-1.57988572e+00],
[-3.90126506e-01],
[ 3.33634302e-01],
[ 2.29442952e+00],
[ 8.43531108e-01],
[-8.73008911e-01],
[ 1.37779900e-01],
[-8.24608111e-01],
[-9.49549712e-01],
[-4.95932907e-01],
[ 2.75930233e+00],
[ 2.96489502e-01],
[ 1.27649500e-01],
[-4.98184107e-01],
[ 7.92486992e-02],
[ 6.10531905e-01],
[-1.74011304e-01],
[-6.16372108e-01],
[-6.51265709e-01],
[-1.41104572e+00],
[ 2.05805352e+00],
[ 4.68706304e-01],
[ 6.66811906e-01],
[-2.78692105e-01],
[ 1.14181511e+00],
[-1.02834171e+00],
[-8.27984911e-01],
[ 4.83339104e-01],
[-9.11279312e-01],
[-7.16550509e-01],
[-6.22000108e-01],
[-4.31773706e-01],
[-8.22356911e-01],
[ 1.12718231e+00],
[-1.26809452e+00],
[-5.92734508e-01],
[-9.46172912e-01],
[ 3.41513502e-01],
[-7.94609025e-02],
[ 5.44854990e-02],
[-9.69810512e-01],
[-8.47120111e-01],
[-1.71760104e-01],
[-1.26922012e+00],
[-1.06436091e+00],
[-6.34381709e-01],
[-4.26145706e-01],
[-7.45816110e-01],
[-2.60682504e-01],
[ 1.63370232e+00],
[ 2.40924072e+00],
[ 8.17642307e-01],
[-3.77744906e-01],
[-1.28722972e+00],
[-1.28987303e-01],
[-8.91018511e-01],
[-1.12626891e+00],
[-1.38178012e+00],
[-6.43386509e-01],
[-7.49192910e-01],
[-2.84320105e-01],
[ 5.36242304e-01],
[-1.79450166e-03],
[ 1.13281031e+00],
[ 3.19490953e+00],
[ 5.86894305e-01],
[-7.19927310e-01],
[-3.70991306e-01],
[ 1.92934300e-01],
[-8.02096110e-01],
[-4.02508106e-01],
[-6.72652109e-01],
[-2.56180104e-01],
[ 5.32865504e-01],
[ 1.09639900e-01],
[ 1.34217191e+00],
[-1.03509531e+00],
[-1.03059291e+00],
[ 3.49206794e+00],
[-4.47532107e-01],
[-1.41104572e+00],
[ 1.35793031e+00],
[-3.33846505e-01],
[ 1.47611831e+00],
[-2.13407304e-01],
[-5.97236908e-01],
[-1.14652971e+00],
[ 1.67084712e+00],
[ 1.67309832e+00],
[ 3.98919103e-01],
[ 2.87749033e+00],
[ 3.80909503e-01],
[ 2.33720232e+00],
[ 1.01760699e-01],
[ 1.25398300e-01],
[ 1.20147191e+00],
[-1.84141704e-01],
[ 3.73030302e-01],
[-4.61039307e-01],
[ 2.70789839e-03],
[ 9.74100709e-01],
[-4.84676907e-01],
[-3.60860906e-01],
[-1.37615212e+00],
[-1.06323531e+00],
[ 2.62535593e+00],
[-7.63825710e-01],
[-7.06420109e-01],
[-1.21108103e-01],
[-7.93091310e-01],
[-7.63825710e-01],
[-3.81121706e-01],
[ 8.37510993e-02],
[-3.73242506e-01],
[ 7.65864707e-01],
[ 2.13234312e+00],
[ 5.04725504e-01],
[-7.90840110e-01],
[ 1.15644791e+00],
[ 6.87072706e-01],
[ 3.21291913e+00],
[-7.33434510e-01],
[ 9.43709509e-01],
[-7.75081710e-01],
[ 9.41458309e-01],
[-9.22535312e-01],
[-1.35589132e+00],
[ 1.16545271e+00],
[-8.58376111e-01],
[-7.16550509e-01],
[-1.26134092e+00],
[-4.28396906e-01],
[-7.16550509e-01],
[-3.95754506e-01],
[-1.09137531e+00],
[ 7.47462992e-02],
[-7.32308910e-01],
[ 2.62721501e-01],
[ 4.75459904e-01],
[-4.61039307e-01],
[-9.20284112e-01],
[-1.01483451e+00],
[-4.79048907e-01],
[-1.09362651e+00],
[-8.08849711e-01],
[ 1.46823911e+00],
[ 1.80591912e+00],
[ 1.04051111e+00],
[ 8.32275107e-01],
[ 3.24629502e-01],
[-2.21286504e-01],
[-1.13228903e-01]])
df.head()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
import pandas as pd
pd.DataFrame(scaler.fit_transform(df[['total_bill','tip']]),columns=['total_bill','tip'])#fit and scale both columns in one step, wrapped back into a DataFrame
| total_bill | tip | |
|---|---|---|
| 0 | -0.314711 | -1.439947 |
| 1 | -1.063235 | -0.969205 |
| 2 | 0.137780 | 0.363356 |
| 3 | 0.438315 | 0.225754 |
| 4 | 0.540745 | 0.443020 |
| ... | ... | ... |
| 239 | 1.040511 | 2.115963 |
| 240 | 0.832275 | -0.722971 |
| 241 | 0.324630 | -0.722971 |
| 242 | -0.221287 | -0.904026 |
| 243 | -0.113229 | 0.001247 |
244 rows × 2 columns
scaler.transform([[13,4]])
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names warnings.warn(
array([[-0.76382571, 0.72546447]])
Min-max scaling formula: x_scaled = (x_i - x_min) / (x_max - x_min)
df=sns.load_dataset('taxis')#switch to the seaborn taxis dataset for the min-max example
df.head()
| pickup | dropoff | passengers | distance | fare | tip | tolls | total | color | payment | pickup_zone | dropoff_zone | pickup_borough | dropoff_borough | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-03-23 20:21:09 | 2019-03-23 20:27:24 | 1 | 1.60 | 7.0 | 2.15 | 0.0 | 12.95 | yellow | credit card | Lenox Hill West | UN/Turtle Bay South | Manhattan | Manhattan |
| 1 | 2019-03-04 16:11:55 | 2019-03-04 16:19:00 | 1 | 0.79 | 5.0 | 0.00 | 0.0 | 9.30 | yellow | cash | Upper West Side South | Upper West Side South | Manhattan | Manhattan |
| 2 | 2019-03-27 17:53:01 | 2019-03-27 18:00:25 | 1 | 1.37 | 7.5 | 2.36 | 0.0 | 14.16 | yellow | credit card | Alphabet City | West Village | Manhattan | Manhattan |
| 3 | 2019-03-10 01:23:59 | 2019-03-10 01:49:51 | 1 | 7.70 | 27.0 | 6.15 | 0.0 | 36.95 | yellow | credit card | Hudson Sq | Yorkville West | Manhattan | Manhattan |
| 4 | 2019-03-30 13:27:42 | 2019-03-30 13:37:14 | 3 | 2.16 | 9.0 | 1.10 | 0.0 | 13.40 | yellow | credit card | Midtown East | Yorkville West | Manhattan | Manhattan |
from sklearn.preprocessing import MinMaxScaler
min_max=MinMaxScaler() #min-max normalization (scales each column to [0, 1]) — not standardization
min_max.fit_transform(df[['distance','fare','tip']]) #each value becomes (x - min) / (max - min), so results lie in [0, 1]
array([[0.04359673, 0.04026846, 0.06475904],
[0.02152589, 0.02684564, 0. ],
[0.0373297 , 0.04362416, 0.07108434],
...,
[0.11280654, 0.10067114, 0. ],
[0.03051771, 0.03355705, 0. ],
[0.10490463, 0.09395973, 0.10120482]])
min_max.fit(df[['distance','fare','tip']])#learn the per-column min and max
MinMaxScaler()
min_max.transform([[1.6,7.0,2.15]])#scale a new observation using the fitted min/max
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but MinMaxScaler was fitted with feature names warnings.warn(
array([[0.04359673, 0.04026846, 0.06475904]])
Unit-vector (L2) normalization: each row is divided by its Euclidean norm, √(a² + b² + …)
from sklearn.preprocessing import normalize
import pandas as pd
unit_vector=pd.DataFrame(normalize(df[['distance','fare','tip']])) #scales each row to unit length: divides the row by sqrt(a^2 + b^2 + ...)
unit_vector
| 0 | 1 | 2 | |
|---|---|---|---|
| 0 | 0.213461 | 0.933894 | 0.286839 |
| 1 | 0.156064 | 0.987747 | 0.000000 |
| 2 | 0.171657 | 0.939731 | 0.295702 |
| 3 | 0.267899 | 0.939386 | 0.213971 |
| 4 | 0.231742 | 0.965592 | 0.118017 |
| ... | ... | ... | ... |
| 6428 | 0.160133 | 0.960800 | 0.226322 |
| 6429 | 0.307453 | 0.951563 | 0.000000 |
| 6430 | 0.250500 | 0.968117 | 0.000000 |
| 6431 | 0.183497 | 0.983020 | 0.000000 |
| 6432 | 0.242956 | 0.946580 | 0.212034 |
6433 rows × 3 columns
Nominal / One-Hot Encoding (OHE)
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
#Create a simple dataframe with one nominal (unordered categorical) feature
# Toy dataframe with a single nominal feature for the one-hot encoding demo
colors = ['red', 'blue', 'green', 'green', 'red', 'blue']
df = pd.DataFrame({'color': colors})
df.head()
| color | |
|---|---|
| 0 | red |
| 1 | blue |
| 2 | green |
| 3 | green |
| 4 | red |
#create an instance of onehotencoder
encoder=OneHotEncoder()
encoded=encoder.fit_transform(df[['color']]).toarray() #fit_transform returns a sparse matrix; toarray() densifies it into the 0/1 indicator columns
import pandas as pd
encoder_df=pd.DataFrame(encoded,columns=encoder.get_feature_names_out())#column names like color_blue, color_green, color_red
encoder_df
| color_blue | color_green | color_red | |
|---|---|---|---|
| 0 | 0.0 | 0.0 | 1.0 |
| 1 | 1.0 | 0.0 | 0.0 |
| 2 | 0.0 | 1.0 | 0.0 |
| 3 | 0.0 | 1.0 | 0.0 |
| 4 | 0.0 | 0.0 | 1.0 |
| 5 | 1.0 | 0.0 | 0.0 |
df.head()
| color | |
|---|---|
| 0 | red |
| 1 | blue |
| 2 | green |
| 3 | green |
| 4 | red |
Label encoding assigns a unique integer value to each category.
from sklearn.preprocessing import LabelEncoder
lbl_encoder=LabelEncoder() #instance
lbl_encoder.fit_transform(df[['color']]) #maps each category to a unique integer; NOTE(review): passing the 2-D frame df[['color']] triggers the DataConversionWarning below — a 1-D df['color'] would avoid it
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\_label.py:115: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
array([2, 0, 1, 1, 2, 0])
lbl_encoder.transform([['red']]) #encode new incoming data with the already-fitted encoder
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\_label.py:133: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
array([2])
Ordinal encoding applies when the categories have an intrinsic order or ranking. In this technique, each category is assigned a numerical value based on its position in the order. For example, if we have the categorical variable education level: 1. High school: 1 2. College: 2 3. Graduate: 3 4. Post-graduate: 4
#ordinal encoding: integer codes that respect an explicitly defined category order
from sklearn.preprocessing import OrdinalEncoder
# Toy dataframe with an ordered categorical feature for ordinal encoding
sizes = ['small', 'medium', 'large', 'medium', 'small', 'large']
df = pd.DataFrame({'size': sizes})
df
| size | |
|---|---|
| 0 | small |
| 1 | medium |
| 2 | large |
| 3 | medium |
| 4 | small |
| 5 | large |
#create an instance of OrdinalEncoder with an explicit category order, then fit_transform
encoder=OrdinalEncoder(categories=[['small','medium','large']])
encoder.fit_transform(df[['size']])#small -> 0, medium -> 1, large -> 2 (per the categories list above)
array([[0.],
[1.],
[2.],
[1.],
[0.],
[2.]])
encoder.transform([['small']])#encode a new value using the already-fitted category order
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but OrdinalEncoder was fitted with feature names warnings.warn(
array([[0.]])
Target-guided encoding is a technique used to encode categories based on their relationship with the target variable. This encoding technique is useful when we have a categorical variable with a large number of unique categories. We replace each category in the categorical variable with a numerical value based on the mean or median of the target variable for that category.
# Create a small dataset for the target-guided (mean) encoding example
import pandas as pd

cities = ['New York', 'London', 'Paris', 'Tokyo', 'New York', 'Paris']
prices = [200, 150, 300, 250, 180, 320]
df = pd.DataFrame({'city': cities, 'price': prices})
df
| city | price | |
|---|---|---|
| 0 | New York | 200 |
| 1 | London | 150 |
| 2 | Paris | 300 |
| 3 | Tokyo | 250 |
| 4 | New York | 180 |
| 5 | Paris | 320 |
# Mean of the target (price) per category — the lookup table for target-guided encoding.
# The original cell echoed `mean_price` twice (duplicate statement); once is enough.
mean_price=df.groupby('city')['price'].mean().to_dict()
mean_price
{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}
df['city_encoded']=df['city'].map(mean_price)#replace each city with its mean target price
df
| city | price | city_encoded | |
|---|---|---|---|
| 0 | New York | 200 | 190.0 |
| 1 | London | 150 | 150.0 |
| 2 | Paris | 300 | 310.0 |
| 3 | Tokyo | 250 | 250.0 |
| 4 | New York | 180 | 190.0 |
| 5 | Paris | 320 | 310.0 |
import seaborn as sns
df=sns.load_dataset('tips')#reload the tips dataset for the next encoding example
df.head()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
#Target-guided encoding of 'time' using the mean total_bill per category
mean_totalbill=df.groupby('time')['total_bill'].mean().to_dict()
mean_totalbill
{'Lunch': 17.168676470588235, 'Dinner': 20.79715909090909}
df['encoded_time']=df['time'].map(mean_totalbill)#replace Lunch/Dinner with their mean total_bill
df
| total_bill | tip | sex | smoker | day | time | size | encoded_time | |
|---|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 20.797159 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 20.797159 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 20.797159 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 20.797159 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 20.797159 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 | 20.797159 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 | 20.797159 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 | 20.797159 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 | 20.797159 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 20.797159 |
244 rows × 8 columns
import pandas as pd
df=pd.read_csv("winequality-red.csv")#red-wine quality dataset for EDA
df.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
#Summary of the dataset
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
#descriptive summary of the dataset
df.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
| mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
| std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
| min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
| 25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
| 50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
| 75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997835 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
df.shape
(1599, 12)
df.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
df['quality'].unique()
array([5, 6, 7, 4, 8, 3], dtype=int64)
#missing values in the dataset (count of nulls per column)
df.isnull().sum()
fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
df.quality.value_counts().plot(kind='bar')#class balance of the target variable
plt.xlabel('Wine Quality')
plt.ylabel("Count")
<AxesSubplot:>
df.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
# One histogram per column. The original drew every column onto the same
# axes, overlaying all twelve histograms into one unreadable plot; opening
# a fresh figure per column fixes that.
for column in df.columns:
    plt.figure()
    sns.histplot(df[column],kde=True)
    plt.show()
sns.histplot(df['alcohol'])
<AxesSubplot:xlabel='alcohol', ylabel='Count'>
#univariate,bivariate,multivariate analysis: pairwise scatter/histogram grid over all numeric columns
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x173ca389730>
sns.catplot(x='quality',y='alcohol',data=df,kind='box')#alcohol distribution per quality class
<seaborn.axisgrid.FacetGrid at 0x173d0633880>
# Student-performance EDA: load the standard data-science stack and the dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
df=pd.read_csv('stud.csv')
df.head()
| gender | race_ethnicity | parental_level_of_education | lunch | test_preparation_course | math_score | reading_score | writing_score | |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 |
##Checking missing values
df.isnull().sum()
gender 0 race_ethnicity 0 parental_level_of_education 0 lunch 0 test_preparation_course 0 math_score 0 reading_score 0 writing_score 0 dtype: int64
df.isna().sum()
gender 0 race_ethnicity 0 parental_level_of_education 0 lunch 0 test_preparation_course 0 math_score 0 reading_score 0 writing_score 0 dtype: int64
##Check Duplicates
df.duplicated().sum()
0
#check datatypes
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 1000 non-null object 1 race_ethnicity 1000 non-null object 2 parental_level_of_education 1000 non-null object 3 lunch 1000 non-null object 4 test_preparation_course 1000 non-null object 5 math_score 1000 non-null int64 6 reading_score 1000 non-null int64 7 writing_score 1000 non-null int64 dtypes: int64(3), object(5) memory usage: 62.6+ KB
##3.1 Checking the number of unique values in each column
df.nunique()
gender 2 race_ethnicity 5 parental_level_of_education 6 lunch 2 test_preparation_course 2 math_score 81 reading_score 72 writing_score 77 dtype: int64
#check the statistics of the dataset (numeric columns only)
df.describe()
| math_score | reading_score | writing_score | |
|---|---|---|---|
| count | 1000.00000 | 1000.000000 | 1000.000000 |
| mean | 66.08900 | 69.169000 | 68.054000 |
| std | 15.16308 | 14.600192 | 15.195657 |
| min | 0.00000 | 17.000000 | 10.000000 |
| 25% | 57.00000 | 59.000000 | 57.750000 |
| 50% | 66.00000 | 70.000000 | 69.000000 |
| 75% | 77.00000 | 79.000000 | 79.000000 |
| max | 100.00000 | 100.000000 | 100.000000 |
#Explore more info about the data
df.head()
| gender | race_ethnicity | parental_level_of_education | lunch | test_preparation_course | math_score | reading_score | writing_score | |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 |
#segregate numerical and categorical features
[feature for feature in df.columns]# lists all column names
['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course', 'math_score', 'reading_score', 'writing_score']
numerical_feature=[feature for feature in df.columns if df[feature].dtype!='O']#dtype 'O' means object; != 'O' keeps the numeric columns
categorical_feature=[feature for feature in df.columns if df[feature].dtype=='O']#dtype 'O' (object) marks the categorical/string columns
numerical_feature
['math_score', 'reading_score', 'writing_score']
categorical_feature
['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
#Aggregate: total score across the three subjects, then its mean as 'average'
df['total_score']=(df['math_score']+df['reading_score']+df['writing_score'])
df['average']=df['total_score']/3
df.head()
| gender | race_ethnicity | parental_level_of_education | lunch | test_preparation_course | math_score | reading_score | writing_score | total_score | average | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 | 218 | 72.666667 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 | 247 | 82.333333 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 | 278 | 92.666667 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 | 148 | 49.333333 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 | 229 | 76.333333 |
#Explore More Visualization
# Use the axes returned by plt.subplots directly. The original mixed the
# object-oriented API (fig, axis) with state-machine plt.subplot(...) calls,
# leaving the created `axis` array unused.
fig,axis=plt.subplots(1,2,figsize=(15,7))
sns.histplot(data=df,x='average',bins=30,kde=True,color='g',ax=axis[0])
sns.histplot(data=df,x='average',bins=30,kde=True,hue='gender',ax=axis[1])
plt.show()
# Zomato EDA: imports for data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sys
# NOTE: the original cell called reload(sys); sys.setdefaultencoding('utf8').
# That is a Python 2 idiom: in Python 3 `reload` is not a builtin (it raises
# the NameError shown below) and str is already Unicode, so no default-encoding
# switch exists or is needed — pass encoding= to pd.read_csv instead, as the
# next cell does.
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [147], in <cell line: 3>() 1 import sys ----> 3 reload(sys) 4 sys.setdefaultencoding('utf8') NameError: name 'reload' is not defined
df=pd.read_csv('zomato.csv',encoding='latin-1')#explicit encoding: the file is not valid UTF-8, and latin-1 can decode any byte
df.head()
| Restaurant ID | Restaurant Name | Country Code | City | Address | Locality | Locality Verbose | Longitude | Latitude | Cuisines | ... | Currency | Has Table booking | Has Online delivery | Is delivering now | Switch to order menu | Price range | Aggregate rating | Rating color | Rating text | Votes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6317637 | Le Petit Souffle | 162 | Makati City | Third Floor, Century City Mall, Kalayaan Avenu... | Century City Mall, Poblacion, Makati City | Century City Mall, Poblacion, Makati City, Mak... | 121.027535 | 14.565443 | French, Japanese, Desserts | ... | Botswana Pula(P) | Yes | No | No | No | 3 | 4.8 | Dark Green | Excellent | 314 |
| 1 | 6304287 | Izakaya Kikufuji | 162 | Makati City | Little Tokyo, 2277 Chino Roces Avenue, Legaspi... | Little Tokyo, Legaspi Village, Makati City | Little Tokyo, Legaspi Village, Makati City, Ma... | 121.014101 | 14.553708 | Japanese | ... | Botswana Pula(P) | Yes | No | No | No | 3 | 4.5 | Dark Green | Excellent | 591 |
| 2 | 6300002 | Heat - Edsa Shangri-La | 162 | Mandaluyong City | Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal... | Edsa Shangri-La, Ortigas, Mandaluyong City | Edsa Shangri-La, Ortigas, Mandaluyong City, Ma... | 121.056831 | 14.581404 | Seafood, Asian, Filipino, Indian | ... | Botswana Pula(P) | Yes | No | No | No | 4 | 4.4 | Green | Very Good | 270 |
| 3 | 6318506 | Ooma | 162 | Mandaluyong City | Third Floor, Mega Fashion Hall, SM Megamall, O... | SM Megamall, Ortigas, Mandaluyong City | SM Megamall, Ortigas, Mandaluyong City, Mandal... | 121.056475 | 14.585318 | Japanese, Sushi | ... | Botswana Pula(P) | No | No | No | No | 4 | 4.9 | Dark Green | Excellent | 365 |
| 4 | 6314302 | Sambo Kojin | 162 | Mandaluyong City | Third Floor, Mega Atrium, SM Megamall, Ortigas... | SM Megamall, Ortigas, Mandaluyong City | SM Megamall, Ortigas, Mandaluyong City, Mandal... | 121.057508 | 14.584450 | Japanese, Korean | ... | Botswana Pula(P) | Yes | No | No | No | 4 | 4.8 | Dark Green | Excellent | 229 |
5 rows × 21 columns
df.columns
Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
'Average Cost for two', 'Currency', 'Has Table booking',
'Has Online delivery', 'Is delivering now', 'Switch to order menu',
'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
'Votes'],
dtype='object')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9551 entries, 0 to 9550 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Restaurant ID 9551 non-null int64 1 Restaurant Name 9551 non-null object 2 Country Code 9551 non-null int64 3 City 9551 non-null object 4 Address 9551 non-null object 5 Locality 9551 non-null object 6 Locality Verbose 9551 non-null object 7 Longitude 9551 non-null float64 8 Latitude 9551 non-null float64 9 Cuisines 9542 non-null object 10 Average Cost for two 9551 non-null int64 11 Currency 9551 non-null object 12 Has Table booking 9551 non-null object 13 Has Online delivery 9551 non-null object 14 Is delivering now 9551 non-null object 15 Switch to order menu 9551 non-null object 16 Price range 9551 non-null int64 17 Aggregate rating 9551 non-null float64 18 Rating color 9551 non-null object 19 Rating text 9551 non-null object 20 Votes 9551 non-null int64 dtypes: float64(3), int64(5), object(13) memory usage: 1.5+ MB
df.describe()
| Restaurant ID | Country Code | Longitude | Latitude | Average Cost for two | Price range | Aggregate rating | Votes | |
|---|---|---|---|---|---|---|---|---|
| count | 9.551000e+03 | 9551.000000 | 9551.000000 | 9551.000000 | 9551.000000 | 9551.000000 | 9551.000000 | 9551.000000 |
| mean | 9.051128e+06 | 18.365616 | 64.126574 | 25.854381 | 1199.210763 | 1.804837 | 2.666370 | 156.909748 |
| std | 8.791521e+06 | 56.750546 | 41.467058 | 11.007935 | 16121.183073 | 0.905609 | 1.516378 | 430.169145 |
| min | 5.300000e+01 | 1.000000 | -157.948486 | -41.330428 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
| 25% | 3.019625e+05 | 1.000000 | 77.081343 | 28.478713 | 250.000000 | 1.000000 | 2.500000 | 5.000000 |
| 50% | 6.004089e+06 | 1.000000 | 77.191964 | 28.570469 | 400.000000 | 2.000000 | 3.200000 | 31.000000 |
| 75% | 1.835229e+07 | 1.000000 | 77.282006 | 28.642758 | 700.000000 | 2.000000 | 3.700000 | 131.000000 |
| max | 1.850065e+07 | 216.000000 | 174.832089 | 55.976980 | 800000.000000 | 4.000000 | 4.900000 | 10934.000000 |
df.isnull().sum()
Restaurant ID 0 Restaurant Name 0 Country Code 0 City 0 Address 0 Locality 0 Locality Verbose 0 Longitude 0 Latitude 0 Cuisines 9 Average Cost for two 0 Currency 0 Has Table booking 0 Has Online delivery 0 Is delivering now 0 Switch to order menu 0 Price range 0 Aggregate rating 0 Rating color 0 Rating text 0 Votes 0 dtype: int64
[features for features in df.columns if df[features].isnull().sum()>0]#columns containing at least one missing value
['Cuisines']
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')#visualize where the missing values fall in the frame
<AxesSubplot:>
df_country=pd.read_excel('Country-Code.xlsx')
df_country.head()
| Country Code | Country | |
|---|---|---|
| 0 | 1 | India |
| 1 | 14 | Australia |
| 2 | 30 | Brazil |
| 3 | 37 | Canada |
| 4 | 94 | Indonesia |
df.columns
Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
'Average Cost for two', 'Currency', 'Has Table booking',
'Has Online delivery', 'Is delivering now', 'Switch to order menu',
'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
'Votes'],
dtype='object')
#combine the dataset
final_df=pd.merge(df,df_country,on='Country Code',how='left')
final_df
| Restaurant ID | Restaurant Name | Country Code | City | Address | Locality | Locality Verbose | Longitude | Latitude | Cuisines | ... | Has Table booking | Has Online delivery | Is delivering now | Switch to order menu | Price range | Aggregate rating | Rating color | Rating text | Votes | Country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6317637 | Le Petit Souffle | 162 | Makati City | Third Floor, Century City Mall, Kalayaan Avenu... | Century City Mall, Poblacion, Makati City | Century City Mall, Poblacion, Makati City, Mak... | 121.027535 | 14.565443 | French, Japanese, Desserts | ... | Yes | No | No | No | 3 | 4.8 | Dark Green | Excellent | 314 | Phillipines |
| 1 | 6304287 | Izakaya Kikufuji | 162 | Makati City | Little Tokyo, 2277 Chino Roces Avenue, Legaspi... | Little Tokyo, Legaspi Village, Makati City | Little Tokyo, Legaspi Village, Makati City, Ma... | 121.014101 | 14.553708 | Japanese | ... | Yes | No | No | No | 3 | 4.5 | Dark Green | Excellent | 591 | Phillipines |
| 2 | 6300002 | Heat - Edsa Shangri-La | 162 | Mandaluyong City | Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal... | Edsa Shangri-La, Ortigas, Mandaluyong City | Edsa Shangri-La, Ortigas, Mandaluyong City, Ma... | 121.056831 | 14.581404 | Seafood, Asian, Filipino, Indian | ... | Yes | No | No | No | 4 | 4.4 | Green | Very Good | 270 | Phillipines |
| 3 | 6318506 | Ooma | 162 | Mandaluyong City | Third Floor, Mega Fashion Hall, SM Megamall, O... | SM Megamall, Ortigas, Mandaluyong City | SM Megamall, Ortigas, Mandaluyong City, Mandal... | 121.056475 | 14.585318 | Japanese, Sushi | ... | No | No | No | No | 4 | 4.9 | Dark Green | Excellent | 365 | Phillipines |
| 4 | 6314302 | Sambo Kojin | 162 | Mandaluyong City | Third Floor, Mega Atrium, SM Megamall, Ortigas... | SM Megamall, Ortigas, Mandaluyong City | SM Megamall, Ortigas, Mandaluyong City, Mandal... | 121.057508 | 14.584450 | Japanese, Korean | ... | Yes | No | No | No | 4 | 4.8 | Dark Green | Excellent | 229 | Phillipines |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9546 | 5915730 | NamlÛ± Gurme | 208 | ÛÁstanbul | Kemankeô Karamustafa Paôa Mahallesi, RÛ±htÛ±... | Karakí_y | Karakí_y, ÛÁstanbul | 28.977392 | 41.022793 | Turkish | ... | No | No | No | No | 3 | 4.1 | Green | Very Good | 788 | Turkey |
| 9547 | 5908749 | Ceviz AÛôacÛ± | 208 | ÛÁstanbul | Koôuyolu Mahallesi, Muhittin íìstí_ndaÛô Cadd... | Koôuyolu | Koôuyolu, ÛÁstanbul | 29.041297 | 41.009847 | World Cuisine, Patisserie, Cafe | ... | No | No | No | No | 3 | 4.2 | Green | Very Good | 1034 | Turkey |
| 9548 | 5915807 | Huqqa | 208 | ÛÁstanbul | Kuruí_eôme Mahallesi, Muallim Naci Caddesi, N... | Kuruí_eôme | Kuruí_eôme, ÛÁstanbul | 29.034640 | 41.055817 | Italian, World Cuisine | ... | No | No | No | No | 4 | 3.7 | Yellow | Good | 661 | Turkey |
| 9549 | 5916112 | Aôôk Kahve | 208 | ÛÁstanbul | Kuruí_eôme Mahallesi, Muallim Naci Caddesi, N... | Kuruí_eôme | Kuruí_eôme, ÛÁstanbul | 29.036019 | 41.057979 | Restaurant Cafe | ... | No | No | No | No | 4 | 4.0 | Green | Very Good | 901 | Turkey |
| 9550 | 5927402 | Walter's Coffee Roastery | 208 | ÛÁstanbul | CafeaÛôa Mahallesi, BademaltÛ± Sokak, No 21/B,... | Moda | Moda, ÛÁstanbul | 29.026016 | 40.984776 | Cafe | ... | No | No | No | No | 2 | 4.0 | Green | Very Good | 591 | Turkey |
9551 rows × 22 columns
##to check Data Types
final_df.dtypes
Restaurant ID int64 Restaurant Name object Country Code int64 City object Address object Locality object Locality Verbose object Longitude float64 Latitude float64 Cuisines object Average Cost for two int64 Currency object Has Table booking object Has Online delivery object Is delivering now object Switch to order menu object Price range int64 Aggregate rating float64 Rating color object Rating text object Votes int64 Country object dtype: object
country_names=final_df.Country.value_counts().index
country_val=final_df.Country.value_counts().values
##Pie chart: top 3 countries that use Zomato
plt.pie(country_val[0:3],labels=country_names[0:3],autopct="%1.2f%%")
([<matplotlib.patches.Wedge at 0x1cd91dd71c0>, <matplotlib.patches.Wedge at 0x1cd91dd7850>, <matplotlib.patches.Wedge at 0x1cd91dd7f70>], [Text(-1.0829742700952103, 0.19278674827836725, 'India'), Text(1.077281715838356, -0.22240527134123297, 'United States'), Text(1.0995865153823035, -0.03015783794312073, 'United Kingdom')], [Text(-0.590713238233751, 0.10515640815183668, '94.39%'), Text(0.5876082086391032, -0.12131196618612707, '4.73%'), Text(0.5997744629358018, -0.01644972978715676, '0.87%')])
final_df.columns
Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
'Average Cost for two', 'Currency', 'Has Table booking',
'Has Online delivery', 'Is delivering now', 'Switch to order menu',
'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
'Votes', 'Country'],
dtype='object')
ratings=final_df.groupby(['Aggregate rating','Rating color','Rating text']).size().reset_index().rename(columns={0:'Rating Count'})
ratings
| Aggregate rating | Rating color | Rating text | Rating Count | |
|---|---|---|---|---|
| 0 | 0.0 | White | Not rated | 2148 |
| 1 | 1.8 | Red | Poor | 1 |
| 2 | 1.9 | Red | Poor | 2 |
| 3 | 2.0 | Red | Poor | 7 |
| 4 | 2.1 | Red | Poor | 15 |
| 5 | 2.2 | Red | Poor | 27 |
| 6 | 2.3 | Red | Poor | 47 |
| 7 | 2.4 | Red | Poor | 87 |
| 8 | 2.5 | Orange | Average | 110 |
| 9 | 2.6 | Orange | Average | 191 |
| 10 | 2.7 | Orange | Average | 250 |
| 11 | 2.8 | Orange | Average | 315 |
| 12 | 2.9 | Orange | Average | 381 |
| 13 | 3.0 | Orange | Average | 468 |
| 14 | 3.1 | Orange | Average | 519 |
| 15 | 3.2 | Orange | Average | 522 |
| 16 | 3.3 | Orange | Average | 483 |
| 17 | 3.4 | Orange | Average | 498 |
| 18 | 3.5 | Yellow | Good | 480 |
| 19 | 3.6 | Yellow | Good | 458 |
| 20 | 3.7 | Yellow | Good | 427 |
| 21 | 3.8 | Yellow | Good | 400 |
| 22 | 3.9 | Yellow | Good | 335 |
| 23 | 4.0 | Green | Very Good | 266 |
| 24 | 4.1 | Green | Very Good | 274 |
| 25 | 4.2 | Green | Very Good | 221 |
| 26 | 4.3 | Green | Very Good | 174 |
| 27 | 4.4 | Green | Very Good | 144 |
| 28 | 4.5 | Dark Green | Excellent | 95 |
| 29 | 4.6 | Dark Green | Excellent | 78 |
| 30 | 4.7 | Dark Green | Excellent | 42 |
| 31 | 4.8 | Dark Green | Excellent | 25 |
| 32 | 4.9 | Dark Green | Excellent | 61 |
ratings.head()
| Aggregate rating | Rating color | Rating text | Rating Count | |
|---|---|---|---|---|
| 0 | 0.0 | White | Not rated | 2148 |
| 1 | 1.8 | Red | Poor | 1 |
| 2 | 1.9 | Red | Poor | 2 |
| 3 | 2.0 | Red | Poor | 7 |
| 4 | 2.1 | Red | Poor | 15 |
import matplotlib
matplotlib.rcParams['figure.figsize']=(12,6)
sns.barplot(x='Aggregate rating',y='Rating Count',hue='Rating color',data=ratings,palette=['blue','red','orange','yellow','green','green'])
<AxesSubplot:xlabel='Aggregate rating', ylabel='Rating Count'>
##Count plot
sns.countplot(x='Rating color',data=ratings,palette=['blue','red','orange','yellow','green','green'])
<AxesSubplot:xlabel='Rating color', ylabel='count'>
##Find the countries that have given a 0 rating
final_df.groupby(['Aggregate rating','Country']).size().reset_index().head(5)
| Aggregate rating | Country | 0 | |
|---|---|---|---|
| 0 | 0.0 | Brazil | 5 |
| 1 | 0.0 | India | 2139 |
| 2 | 0.0 | United Kingdom | 1 |
| 3 | 0.0 | United States | 3 |
| 4 | 1.8 | India | 1 |
##find out which currency is used by which country
final_df[['Country','Currency']].groupby(['Country','Currency']).size().reset_index()
| Country | Currency | 0 | |
|---|---|---|---|
| 0 | Australia | Dollar($) | 24 |
| 1 | Brazil | Brazilian Real(R$) | 60 |
| 2 | Canada | Dollar($) | 4 |
| 3 | India | Indian Rupees(Rs.) | 8652 |
| 4 | Indonesia | Indonesian Rupiah(IDR) | 21 |
| 5 | New Zealand | NewZealand($) | 40 |
| 6 | Phillipines | Botswana Pula(P) | 22 |
| 7 | Qatar | Qatari Rial(QR) | 20 |
| 8 | Singapore | Dollar($) | 20 |
| 9 | South Africa | Rand(R) | 60 |
| 10 | Sri Lanka | Sri Lankan Rupee(LKR) | 20 |
| 11 | Turkey | Turkish Lira(TL) | 34 |
| 12 | UAE | Emirati Diram(AED) | 60 |
| 13 | United Kingdom | Pounds(£) | 80 |
| 14 | United States | Dollar($) | 434 |
##Which countries have the online delivery option
final_df[final_df['Has Online delivery']=="Yes"].Country.value_counts()
India 2423 UAE 28 Name: Country, dtype: int64
##drop city category Feature
df.drop('City_Category',axis=1,inplace=True)
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) Input In [68], in <cell line: 2>() 1 ##drop city category Feature ----> 2 df.drop('City_Category',axis=1,inplace=True) File C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs) 305 if len(args) > num_allow_args: 306 warnings.warn( 307 msg.format(arguments=arguments), 308 FutureWarning, 309 stacklevel=stacklevel, 310 ) --> 311 return func(*args, **kwargs) File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py:4954, in DataFrame.drop(self, labels, axis, index, columns, level, inplace, errors) 4806 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) 4807 def drop( 4808 self, (...) 4815 errors: str = "raise", 4816 ): 4817 """ 4818 Drop specified labels from rows or columns. 4819 (...) 4952 weight 1.0 0.8 4953 """ -> 4954 return super().drop( 4955 labels=labels, 4956 axis=axis, 4957 index=index, 4958 columns=columns, 4959 level=level, 4960 inplace=inplace, 4961 errors=errors, 4962 ) File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py:4267, in NDFrame.drop(self, labels, axis, index, columns, level, inplace, errors) 4265 for axis, labels in axes.items(): 4266 if labels is not None: -> 4267 obj = obj._drop_axis(labels, axis, level=level, errors=errors) 4269 if inplace: 4270 self._update_inplace(obj) File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py:4311, in NDFrame._drop_axis(self, labels, axis, level, errors, consolidate, only_slice) 4309 new_axis = axis.drop(labels, level=level, errors=errors) 4310 else: -> 4311 new_axis = axis.drop(labels, errors=errors) 4312 indexer = axis.get_indexer(new_axis) 4314 # Case for non-unique axis 4315 else: File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py:6644, in Index.drop(self, labels, errors) 6642 if 
mask.any(): 6643 if errors != "ignore": -> 6644 raise KeyError(f"{list(labels[mask])} not found in axis") 6645 indexer = indexer[~mask] 6646 return self.delete(indexer) KeyError: "['City_Category'] not found in axis"
df.columns
Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
'Average Cost for two', 'Currency', 'Has Table booking',
'Has Online delivery', 'Is delivering now', 'Switch to order menu',
'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
'Votes'],
dtype='object')
df_train=pd.read_csv('train.csv')
df_train
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 550063 | 1006033 | P00372445 | M | 51-55 | 13 | B | 1 | 1 | 20 | NaN | NaN | 368 |
| 550064 | 1006035 | P00375436 | F | 26-35 | 1 | C | 3 | 0 | 20 | NaN | NaN | 371 |
| 550065 | 1006036 | P00375436 | F | 26-35 | 15 | B | 4+ | 1 | 20 | NaN | NaN | 137 |
| 550066 | 1006038 | P00375436 | F | 55+ | 1 | C | 2 | 0 | 20 | NaN | NaN | 365 |
| 550067 | 1006039 | P00371644 | F | 46-50 | 0 | B | 4+ | 1 | 20 | NaN | NaN | 490 |
550068 rows × 12 columns
##import the test data
df_test=pd.read_csv('test.csv')
df_test
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000004 | P00128942 | M | 46-50 | 7 | B | 2 | 1 | 1 | 11.0 | NaN |
| 1 | 1000009 | P00113442 | M | 26-35 | 17 | C | 0 | 0 | 3 | 5.0 | NaN |
| 2 | 1000010 | P00288442 | F | 36-45 | 1 | B | 4+ | 1 | 5 | 14.0 | NaN |
| 3 | 1000010 | P00145342 | F | 36-45 | 1 | B | 4+ | 1 | 4 | 9.0 | NaN |
| 4 | 1000011 | P00053842 | F | 26-35 | 1 | C | 1 | 0 | 4 | 5.0 | 12.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 233594 | 1006036 | P00118942 | F | 26-35 | 15 | B | 4+ | 1 | 8 | NaN | NaN |
| 233595 | 1006036 | P00254642 | F | 26-35 | 15 | B | 4+ | 1 | 5 | 8.0 | NaN |
| 233596 | 1006036 | P00031842 | F | 26-35 | 15 | B | 4+ | 1 | 1 | 5.0 | 12.0 |
| 233597 | 1006037 | P00124742 | F | 46-50 | 1 | C | 4+ | 0 | 10 | 16.0 | NaN |
| 233598 | 1006039 | P00316642 | F | 46-50 | 0 | B | 4+ | 1 | 4 | 5.0 | NaN |
233599 rows × 11 columns
#merge both train and test data row-wise (test rows get NaN in 'Purchase')
#DataFrame.append is deprecated since pandas 1.4 -- use pd.concat instead
df=pd.concat([df_train,df_test])
C:\Users\parsi\AppData\Local\Temp\ipykernel_7132\4047838133.py:2: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df=df_train.append(df_test)
df
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370.0 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200.0 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422.0 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057.0 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 233594 | 1006036 | P00118942 | F | 26-35 | 15 | B | 4+ | 1 | 8 | NaN | NaN | NaN |
| 233595 | 1006036 | P00254642 | F | 26-35 | 15 | B | 4+ | 1 | 5 | 8.0 | NaN | NaN |
| 233596 | 1006036 | P00031842 | F | 26-35 | 15 | B | 4+ | 1 | 1 | 5.0 | 12.0 | NaN |
| 233597 | 1006037 | P00124742 | F | 46-50 | 1 | C | 4+ | 0 | 10 | 16.0 | NaN | NaN |
| 233598 | 1006039 | P00316642 | F | 46-50 | 0 | B | 4+ | 1 | 4 | 5.0 | NaN | NaN |
783667 rows × 12 columns
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 783667 entries, 0 to 233598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 User_ID 783667 non-null int64 1 Product_ID 783667 non-null object 2 Gender 783667 non-null object 3 Age 783667 non-null object 4 Occupation 783667 non-null int64 5 City_Category 783667 non-null object 6 Stay_In_Current_City_Years 783667 non-null object 7 Marital_Status 783667 non-null int64 8 Product_Category_1 783667 non-null int64 9 Product_Category_2 537685 non-null float64 10 Product_Category_3 237858 non-null float64 11 Purchase 550068 non-null float64 dtypes: float64(3), int64(4), object(5) memory usage: 77.7+ MB
df.describe()
| User_ID | Occupation | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|
| count | 7.836670e+05 | 783667.000000 | 783667.000000 | 783667.000000 | 537685.000000 | 237858.000000 | 550068.000000 |
| mean | 1.003029e+06 | 8.079300 | 0.409777 | 5.366196 | 9.844506 | 12.668605 | 9263.968713 |
| std | 1.727267e+03 | 6.522206 | 0.491793 | 3.878160 | 5.089093 | 4.125510 | 5023.065394 |
| min | 1.000001e+06 | 0.000000 | 0.000000 | 1.000000 | 2.000000 | 3.000000 | 12.000000 |
| 25% | 1.001519e+06 | 2.000000 | 0.000000 | 1.000000 | 5.000000 | 9.000000 | 5823.000000 |
| 50% | 1.003075e+06 | 7.000000 | 0.000000 | 5.000000 | 9.000000 | 14.000000 | 8047.000000 |
| 75% | 1.004478e+06 | 14.000000 | 1.000000 | 8.000000 | 15.000000 | 16.000000 | 12054.000000 |
| max | 1.006040e+06 | 20.000000 | 1.000000 | 20.000000 | 18.000000 | 18.000000 | 23961.000000 |
df.drop(['User_ID'],axis=1,inplace=True)
df.head()
| Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370.0 |
| 1 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200.0 |
| 2 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422.0 |
| 3 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057.0 |
| 4 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969.0 |
#Convert Gender from categorical to numerical
df['Gender']=df['Gender'].map({'F':0,'M':1})
df
| Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | P00069042 | 0 | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370.0 |
| 1 | P00248942 | 0 | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200.0 |
| 2 | P00087842 | 0 | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422.0 |
| 3 | P00085442 | 0 | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057.0 |
| 4 | P00285442 | 1 | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 233594 | P00118942 | 0 | 26-35 | 15 | B | 4+ | 1 | 8 | NaN | NaN | NaN |
| 233595 | P00254642 | 0 | 26-35 | 15 | B | 4+ | 1 | 5 | 8.0 | NaN | NaN |
| 233596 | P00031842 | 0 | 26-35 | 15 | B | 4+ | 1 | 1 | 5.0 | 12.0 | NaN |
| 233597 | P00124742 | 0 | 46-50 | 1 | C | 4+ | 0 | 10 | 16.0 | NaN | NaN |
| 233598 | P00316642 | 0 | 46-50 | 0 | B | 4+ | 1 | 4 | 5.0 | NaN | NaN |
783667 rows × 11 columns
#Handle the Age categorical feature: convert to numerical
df['Age'].unique()
array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
dtype=object)
df['Age']=df['Age'].map({'0-17':1,'18-25':2,'26-35':3,'36-45':4,'46-50':5,'51-55':6,'55+':7})
df
| User_ID | Product_ID | Gender | Age | Occupation | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | B | C | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 1 | 10 | 2 | 0 | 3 | 8.0 | 16.0 | 8370.0 | 0 | 0 |
| 1 | 1000001 | P00248942 | F | 1 | 10 | 2 | 0 | 1 | 6.0 | 14.0 | 15200.0 | 0 | 0 |
| 2 | 1000001 | P00087842 | F | 1 | 10 | 2 | 0 | 12 | 8.0 | 16.0 | 1422.0 | 0 | 0 |
| 3 | 1000001 | P00085442 | F | 1 | 10 | 2 | 0 | 12 | 14.0 | 16.0 | 1057.0 | 0 | 0 |
| 4 | 1000002 | P00285442 | M | 7 | 16 | 4 | 0 | 8 | 8.0 | 16.0 | 7969.0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 233594 | 1006036 | P00118942 | F | 3 | 15 | 4 | 1 | 8 | 8.0 | 16.0 | NaN | 1 | 0 |
| 233595 | 1006036 | P00254642 | F | 3 | 15 | 4 | 1 | 5 | 8.0 | 16.0 | NaN | 1 | 0 |
| 233596 | 1006036 | P00031842 | F | 3 | 15 | 4 | 1 | 1 | 5.0 | 12.0 | NaN | 1 | 0 |
| 233597 | 1006037 | P00124742 | F | 5 | 1 | 4 | 0 | 10 | 16.0 | 16.0 | NaN | 0 | 1 |
| 233598 | 1006039 | P00316642 | F | 5 | 0 | 4 | 1 | 4 | 5.0 | 16.0 | NaN | 1 | 0 |
783667 rows × 13 columns
#Second technique
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
df['Age']=label_encoder.fit_transform(df['Age'])
df['Age'].unique()
array([0, 6, 2, 4, 5, 3, 1], dtype=int64)
df
| Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | P00069042 | 0 | 0 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370.0 |
| 1 | P00248942 | 0 | 0 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200.0 |
| 2 | P00087842 | 0 | 0 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422.0 |
| 3 | P00085442 | 0 | 0 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057.0 |
| 4 | P00285442 | 1 | 6 | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 233594 | P00118942 | 0 | 2 | 15 | B | 4+ | 1 | 8 | NaN | NaN | NaN |
| 233595 | P00254642 | 0 | 2 | 15 | B | 4+ | 1 | 5 | 8.0 | NaN | NaN |
| 233596 | P00031842 | 0 | 2 | 15 | B | 4+ | 1 | 1 | 5.0 | 12.0 | NaN |
| 233597 | P00124742 | 0 | 4 | 1 | C | 4+ | 0 | 10 | 16.0 | NaN | NaN |
| 233598 | P00316642 | 0 | 4 | 0 | B | 4+ | 1 | 4 | 5.0 | NaN | NaN |
783667 rows × 11 columns
#fixing categorical City_Category
df_city=pd.get_dummies(df['City_Category'],drop_first=True)
df_city
| B | C | |
|---|---|---|
| 0 | 0 | 0 |
| 1 | 0 | 0 |
| 2 | 0 | 0 |
| 3 | 0 | 0 |
| 4 | 0 | 1 |
| ... | ... | ... |
| 233594 | 1 | 0 |
| 233595 | 1 | 0 |
| 233596 | 1 | 0 |
| 233597 | 0 | 1 |
| 233598 | 1 | 0 |
783667 rows × 2 columns
df=pd.concat([df,df_city],axis=1)
df.head()
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | B | C | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370.0 | 0 | 0 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200.0 | 0 | 0 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422.0 | 0 | 0 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057.0 | 0 | 0 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969.0 | 0 | 1 |
#drop City category
df.drop('City_Category',axis=1,inplace=True)
df.head()
| User_ID | Product_ID | Gender | Age | Occupation | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | B | C | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | 2 | 0 | 3 | NaN | NaN | 8370.0 | 0 | 0 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | 2 | 0 | 1 | 6.0 | 14.0 | 15200.0 | 0 | 0 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | 2 | 0 | 12 | NaN | NaN | 1422.0 | 0 | 0 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | 2 | 0 | 12 | 14.0 | NaN | 1057.0 | 0 | 0 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | 4+ | 0 | 8 | NaN | NaN | 7969.0 | 0 | 1 |
#Check missing values
df.isnull().sum()
User_ID 0 Product_ID 0 Gender 0 Age 0 Occupation 0 Stay_In_Current_City_Years 0 Marital_Status 0 Product_Category_1 0 Product_Category_2 245982 Product_Category_3 545809 Purchase 233599 B 0 C 0 dtype: int64
#Focus on replacing missing values
df['Product_Category_2'].unique()
array([nan, 6., 14., 2., 8., 15., 16., 11., 5., 3., 4., 12., 9.,
10., 17., 13., 7., 18.])
df['Product_Category_2'].value_counts()
8.0 91317 14.0 78834 2.0 70498 16.0 61687 15.0 54114 5.0 37165 4.0 36705 6.0 23575 11.0 20230 17.0 19104 13.0 15054 9.0 8177 12.0 7801 10.0 4420 3.0 4123 18.0 4027 7.0 854 Name: Product_Category_2, dtype: int64
##Replace the missing values with mode
df['Product_Category_2']=df['Product_Category_2'].fillna(df['Product_Category_2'].mode()[0])
df['Product_Category_2'].isnull().sum()
0
df['Product_Category_3']=df['Product_Category_3'].fillna(df['Product_Category_3'].mode()[0])
df['Product_Category_3'].isnull().sum()
0
df['Stay_In_Current_City_Years'].unique()
array(['2', '4+', '3', '1', '0'], dtype=object)
#replace '4+' with '4' so the column can later be cast to int
#regex=False treats '+' as a literal character ('+' is a regex metacharacter,
#which is what triggered the FutureWarning in the original call)
df['Stay_In_Current_City_Years']=df['Stay_In_Current_City_Years'].str.replace('+','',regex=False)
C:\Users\parsi\AppData\Local\Temp\ipykernel_7132\2360179872.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
df['Stay_In_Current_City_Years']=df['Stay_In_Current_City_Years'].str.replace('+', '')
df.head()
| User_ID | Product_ID | Gender | Age | Occupation | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | B | C | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | 2 | 0 | 3 | 8.0 | 16.0 | 8370.0 | 0 | 0 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | 2 | 0 | 1 | 6.0 | 14.0 | 15200.0 | 0 | 0 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | 2 | 0 | 12 | 8.0 | 16.0 | 1422.0 | 0 | 0 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | 2 | 0 | 12 | 14.0 | 16.0 | 1057.0 | 0 | 0 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | 4 | 0 | 8 | 8.0 | 16.0 | 7969.0 | 0 | 1 |
#Convert object dtype into integers
df['Stay_In_Current_City_Years']=df['Stay_In_Current_City_Years'].astype(int)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 783667 entries, 0 to 233598 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 User_ID 783667 non-null int64 1 Product_ID 783667 non-null object 2 Gender 783667 non-null object 3 Age 783667 non-null object 4 Occupation 783667 non-null int64 5 Stay_In_Current_City_Years 783667 non-null int32 6 Marital_Status 783667 non-null int64 7 Product_Category_1 783667 non-null int64 8 Product_Category_2 783667 non-null float64 9 Product_Category_3 783667 non-null float64 10 Purchase 550068 non-null float64 11 B 783667 non-null uint8 12 C 783667 non-null uint8 dtypes: float64(3), int32(1), int64(4), object(3), uint8(2) memory usage: 70.3+ MB
df['B']=df['B'].astype(int)
df['C']=df['C'].astype(int)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 783667 entries, 0 to 233598 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 User_ID 783667 non-null int64 1 Product_ID 783667 non-null object 2 Gender 783667 non-null object 3 Age 783667 non-null object 4 Occupation 783667 non-null int64 5 Stay_In_Current_City_Years 783667 non-null int32 6 Marital_Status 783667 non-null int64 7 Product_Category_1 783667 non-null int64 8 Product_Category_2 783667 non-null float64 9 Product_Category_3 783667 non-null float64 10 Purchase 550068 non-null float64 11 B 783667 non-null int32 12 C 783667 non-null int32 dtypes: float64(3), int32(3), int64(4), object(3) memory usage: 74.7+ MB
#Visualization: mean Purchase per Age group, split by Gender
#x/y passed as keywords -- seaborn 0.12+ rejects them as positional arguments
sns.barplot(x='Age',y='Purchase',hue='Gender',data=df)
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='Age', ylabel='Purchase'>
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
df=pd.read_csv('stud (2).csv')
df.head()
| gender | race_ethnicity | parental_level_of_education | lunch | test_preparation_course | math_score | reading_score | writing_score | |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 |
df.shape
(1000, 8)
#missing values
df.isnull().sum()
gender 0 race_ethnicity 0 parental_level_of_education 0 lunch 0 test_preparation_course 0 math_score 0 reading_score 0 writing_score 0 dtype: int64
#check duplicated
df.duplicated().sum()
0
#Check datatypes
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 1000 non-null object 1 race_ethnicity 1000 non-null object 2 parental_level_of_education 1000 non-null object 3 lunch 1000 non-null object 4 test_preparation_course 1000 non-null object 5 math_score 1000 non-null int64 6 reading_score 1000 non-null int64 7 writing_score 1000 non-null int64 dtypes: int64(3), object(5) memory usage: 62.6+ KB
#checking the number of unique values in each column
df.nunique()
gender 2 race_ethnicity 5 parental_level_of_education 6 lunch 2 test_preparation_course 2 math_score 81 reading_score 72 writing_score 77 dtype: int64
#check the statistics of the dataset
df.describe()
| math_score | reading_score | writing_score | |
|---|---|---|---|
| count | 1000.00000 | 1000.000000 | 1000.000000 |
| mean | 66.08900 | 69.169000 | 68.054000 |
| std | 15.16308 | 14.600192 | 15.195657 |
| min | 0.00000 | 17.000000 | 10.000000 |
| 25% | 57.00000 | 59.000000 | 57.750000 |
| 50% | 66.00000 | 70.000000 | 69.000000 |
| 75% | 77.00000 | 79.000000 | 79.000000 |
| max | 100.00000 | 100.000000 | 100.000000 |
df.head()
| gender | race_ethnicity | parental_level_of_education | lunch | test_preparation_course | math_score | reading_score | writing_score | |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 |
numerical_feature=[feature for feature in df.columns if df[feature].dtype!='O']
categorical_feature=[feature for feature in df.columns if df[feature].dtype=='O']
numerical_feature
['math_score', 'reading_score', 'writing_score']
categorical_feature
['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
#explore more visualization
#plt.subplots (plural) returns (fig, axes) -- plt.subplot (singular) returns a
#single Axes and the original tuple-unpack raised an error
fig,axis=plt.subplots(1,2,figsize=(15,7))
plt.subplot(121)
#the original call ended in a bare 'x=' (SyntaxError); plot a score distribution
#NOTE(review): 'math_score' assumed as the intended column -- confirm
sns.histplot(data=df,x='math_score',kde=True)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
df=pd.read_excel('flight_price.xlsx')
df.head()
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 |
| 1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
| 2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 |
| 3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
| 4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
df.tail()
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 10678 | Air Asia | 9/04/2019 | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | non-stop | No info | 4107 |
| 10679 | Air India | 27/04/2019 | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | non-stop | No info | 4145 |
| 10680 | Jet Airways | 27/04/2019 | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | non-stop | No info | 7229 |
| 10681 | Vistara | 01/03/2019 | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | non-stop | No info | 12648 |
| 10682 | Air India | 9/05/2019 | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2 stops | No info | 11753 |
#get the basic info of data
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Date_of_Journey 10683 non-null object 2 Source 10683 non-null object 3 Destination 10683 non-null object 4 Route 10682 non-null object 5 Dep_Time 10683 non-null object 6 Arrival_Time 10683 non-null object 7 Duration 10683 non-null object 8 Total_Stops 10682 non-null object 9 Additional_Info 10683 non-null object 10 Price 10683 non-null int64 dtypes: int64(1), object(10) memory usage: 918.2+ KB
df.describe()
| Price | |
|---|---|
| count | 10683.000000 |
| mean | 9087.064121 |
| std | 4611.359167 |
| min | 1759.000000 |
| 25% | 5277.000000 |
| 50% | 8372.000000 |
| 75% | 12373.000000 |
| max | 79512.000000 |
df.head()
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 |
| 1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
| 2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 |
| 3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
| 4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
# Feature engineering: split Date_of_Journey ("dd/mm/yyyy") into Date, Month, Year.
# NOTE: the new columns are still strings here; they are cast to int further below.
df['Date']=df['Date_of_Journey'].str.split('/').str[0]  # day component
df['Month']=df['Date_of_Journey'].str.split('/').str[1]  # month component
df['Year']=df['Date_of_Journey'].str.split('/').str[2]  # year component
df.head()
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Date | Month | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 | 24 | 03 | 2019 |
| 1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 | 1 | 05 | 2019 |
| 2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 | 9 | 06 | 2019 |
| 3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 | 12 | 05 | 2019 |
| 4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 | 01 | 03 | 2019 |
df.info() #see all the date month and year are still objects soo we should convert to numerical
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Date_of_Journey 10683 non-null object 2 Source 10683 non-null object 3 Destination 10683 non-null object 4 Route 10682 non-null object 5 Dep_Time 10683 non-null object 6 Arrival_Time 10683 non-null object 7 Duration 10683 non-null object 8 Total_Stops 10682 non-null object 9 Additional_Info 10683 non-null object 10 Price 10683 non-null int64 11 Date 10683 non-null object 12 Month 10683 non-null object 13 Year 10683 non-null object dtypes: int64(1), object(13) memory usage: 1.1+ MB
df['Date']=df['Date'].astype(int)#it convert object to interger (day string -> int)
df['Month']=df['Month'].astype(int)  # month string -> int
df['Year']=df['Year'].astype(int)  # year string -> int
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Date_of_Journey 10683 non-null object 2 Source 10683 non-null object 3 Destination 10683 non-null object 4 Route 10682 non-null object 5 Dep_Time 10683 non-null object 6 Arrival_Time 10683 non-null object 7 Duration 10683 non-null object 8 Total_Stops 10682 non-null object 9 Additional_Info 10683 non-null object 10 Price 10683 non-null int64 11 Date 10683 non-null int32 12 Month 10683 non-null int32 13 Year 10683 non-null int32 dtypes: int32(3), int64(1), object(10) memory usage: 1.0+ MB
## Drop Date_of_Journey now that Date/Month/Year carry the same information.
df.drop('Date_of_Journey',axis=1,inplace=True)
# Arrival_Time sometimes carries a trailing date ("01:10 22 Mar"); keep only "hh:mm".
df['Arrival_Time']=df['Arrival_Time'].apply(lambda x:x.split(' ')[0])
## hour part of "hh:mm"
df['Arrival_hour']=df['Arrival_Time'].str.split(':').str[0]
## minute part of "hh:mm"
df['Arrival_min']=df['Arrival_Time'].str.split(':').str[1]
df.head()
| Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Date | Month | Year | Arrival_hour | Arrival_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 2019 | 01 | 10 |
| 1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 2019 | 13 | 15 |
| 2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 | 19h | 2 stops | No info | 13882 | 9 | 6 | 2019 | 04 | 25 |
| 3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 | 12 | 5 | 2019 | 23 | 30 |
| 4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 | 1 | 3 | 2019 | 21 | 35 |
df.head(2)
| Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Date | Month | Year | Arrival_hour | Arrival_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 2019 | 01 | 10 |
| 1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 2019 | 13 | 15 |
df['Arrival_hour']=df['Arrival_hour'].astype(int)  # "hh" string -> int
df['Arrival_min']=df['Arrival_min'].astype(int)  # "mm" string -> int
df.drop('Arrival_Time',axis=1,inplace=True)  # raw time string no longer needed
df.head()
| Airline | Source | Destination | Route | Dep_Time | Duration | Total_Stops | Additional_Info | Price | Date | Month | Year | Arrival_hour | Arrival_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 2019 | 1 | 10 |
| 1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 2019 | 13 | 15 |
| 2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 19h | 2 stops | No info | 13882 | 9 | 6 | 2019 | 4 | 25 |
| 3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 5h 25m | 1 stop | No info | 6218 | 12 | 5 | 2019 | 23 | 30 |
| 4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 4h 45m | 1 stop | No info | 13302 | 1 | 3 | 2019 | 21 | 35 |
# Dep_Time ("hh:mm") -> hour/minute string features (cast to int below).
# The column names keep the original "Depature" spelling used throughout this notebook.
df['Depature_hour']=df['Dep_Time'].str.split(':').str[0]
df['Depature_min']=df['Dep_Time'].str.split(':').str[1]
df.head(2)
| Airline | Source | Destination | Route | Dep_Time | Duration | Total_Stops | Additional_Info | Price | Date | Month | Year | Arrival_hour | Arrival_min | Depature_hour | Depature_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 2019 | 1 | 10 | 22 | 20 |
| 1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 2019 | 13 | 15 | 05 | 50 |
df['Depature_hour']=df['Depature_hour'].astype(int)  # "hh" string -> int
df['Depature_min']=df['Depature_min'].astype(int)  # "mm" string -> int
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Source 10683 non-null object 2 Destination 10683 non-null object 3 Route 10682 non-null object 4 Dep_Time 10683 non-null object 5 Duration 10683 non-null object 6 Total_Stops 10682 non-null object 7 Additional_Info 10683 non-null object 8 Price 10683 non-null int64 9 Date 10683 non-null int32 10 Month 10683 non-null int32 11 Year 10683 non-null int32 12 Arrival_hour 10683 non-null int32 13 Arrival_min 10683 non-null int32 14 Depature_hour 10683 non-null int32 15 Depature_min 10683 non-null int32 dtypes: int32(7), int64(1), object(8) memory usage: 1.0+ MB
df.drop('Dep_Time',axis=1,inplace=True)  # raw departure time string no longer needed
df.head(2)
| Airline | Source | Destination | Route | Duration | Total_Stops | Additional_Info | Price | Date | Month | Year | Arrival_hour | Arrival_min | Depature_hour | Depature_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | BLR → DEL | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 2019 | 1 | 10 | 22 | 20 |
| 1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 2019 | 13 | 15 | 5 | 50 |
df['Total_Stops'].unique()
array(['non-stop', '2 stops', '1 stop', '3 stops', nan, '4 stops'],
dtype=object)
df[df['Total_Stops'].isnull()]
| Airline | Source | Destination | Route | Duration | Total_Stops | Additional_Info | Price | Date | Month | Year | Arrival_hour | Arrival_min | Depature_hour | Depature_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9039 | Air India | Delhi | Cochin | NaN | 23h 40m | NaN | No info | 7480 | 6 | 5 | 2019 | 9 | 25 | 9 | 45 |
# Encode Total_Stops ordinally (number of stops). The single NaN row
# (index 9039, shown above) is imputed with 1 — NOTE(review): this assumes
# 1 stop is a reasonable fill value; confirm against the route (2 segments).
df['Total_Stops']=df['Total_Stops'].map({'non-stop':0,'1 stop':1,'2 stops':2,'3 stops':3,'4 stops':4,np.nan:1})
df[df['Total_Stops'].isnull()]
| Airline | Source | Destination | Route | Duration | Total_Stops | Additional_Info | Price | Date | Month | Year | Arrival_hour | Arrival_min | Depature_hour | Depature_min |
|---|
df.head()
| Airline | Source | Destination | Route | Duration | Total_Stops | Additional_Info | Price | Date | Month | Year | Arrival_hour | Arrival_min | Depature_hour | Depature_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | BLR → DEL | 2h 50m | 0 | No info | 3897 | 24 | 3 | 2019 | 1 | 10 | 22 | 20 |
| 1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 7h 25m | 2 | No info | 7662 | 1 | 5 | 2019 | 13 | 15 | 5 | 50 |
| 2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 19h | 2 | No info | 13882 | 9 | 6 | 2019 | 4 | 25 | 9 | 25 |
| 3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 5h 25m | 1 | No info | 6218 | 12 | 5 | 2019 | 23 | 30 | 18 | 5 |
| 4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 4h 45m | 1 | No info | 13302 | 1 | 3 | 2019 | 21 | 35 | 16 | 50 |
df['Duration'].str.split(' ').str[0].str.split('h').str[0]
0 2
1 7
2 19
3 5
4 4
..
10678 2
10679 2
10680 3
10681 2
10682 8
Name: Duration, Length: 10683, dtype: object
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the nominal categorical columns; fit_transform returns a
# sparse matrix, so toarray() densifies it for display.
encoder=OneHotEncoder()
encoder.fit_transform(df[['Airline','Source','Destination']]).toarray()
array([[0., 0., 0., ..., 0., 0., 1.],
[0., 1., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 1.],
[0., 1., 0., ..., 0., 0., 0.]])
pd.DataFrame(encoder.fit_transform(df[['Airline','Source','Destination']]).toarray(),columns=encoder.get_feature_names_out())
| Airline_Air Asia | Airline_Air India | Airline_GoAir | Airline_IndiGo | Airline_Jet Airways | Airline_Jet Airways Business | Airline_Multiple carriers | Airline_Multiple carriers Premium economy | Airline_SpiceJet | Airline_Trujet | ... | Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | Destination_Banglore | Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10678 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 10679 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 10680 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 10681 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 10682 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
10683 rows × 23 columns
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
df=pd.read_csv('https://raw.githubusercontent.com/krishnaik06/playstore-Dataset/main/googleplaystore.csv')
df.head()
| App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19M | 10,000+ | Free | 0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up |
| 1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14M | 500,000+ | Free | 0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up |
| 2 | U Launcher Lite – FREE Live Cool Themes, Hide ... | ART_AND_DESIGN | 4.7 | 87510 | 8.7M | 5,000,000+ | Free | 0 | Everyone | Art & Design | August 1, 2018 | 1.2.4 | 4.0.3 and up |
| 3 | Sketch - Draw & Paint | ART_AND_DESIGN | 4.5 | 215644 | 25M | 50,000,000+ | Free | 0 | Teen | Art & Design | June 8, 2018 | Varies with device | 4.2 and up |
| 4 | Pixel Draw - Number Art Coloring Book | ART_AND_DESIGN | 4.3 | 967 | 2.8M | 100,000+ | Free | 0 | Everyone | Art & Design;Creativity | June 20, 2018 | 1.1 | 4.4 and up |
df.shape
(10841, 13)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10841 entries, 0 to 10840 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 App 10841 non-null object 1 Category 10841 non-null object 2 Rating 9367 non-null float64 3 Reviews 10841 non-null object 4 Size 10841 non-null object 5 Installs 10841 non-null object 6 Type 10840 non-null object 7 Price 10841 non-null object 8 Content Rating 10840 non-null object 9 Genres 10841 non-null object 10 Last Updated 10841 non-null object 11 Current Ver 10833 non-null object 12 Android Ver 10838 non-null object dtypes: float64(1), object(12) memory usage: 1.1+ MB
##Summary of the dataset
df.describe()
| Rating | |
|---|---|
| count | 9367.000000 |
| mean | 4.193338 |
| std | 0.537431 |
| min | 1.000000 |
| 25% | 4.000000 |
| 50% | 4.300000 |
| 75% | 4.500000 |
| max | 19.000000 |
df.isnull().sum()
App 0 Category 0 Rating 1474 Reviews 0 Size 0 Installs 0 Type 1 Price 0 Content Rating 1 Genres 0 Last Updated 0 Current Ver 8 Android Ver 3 dtype: int64
df.head(2)
| App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19M | 10,000+ | Free | 0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up |
| 1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14M | 500,000+ | Free | 0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up |
#Check if all the values
df['Reviews'].unique()
array(['159', '967', '87510', ..., '603', '1195', '398307'], dtype=object)
df['Reviews'].str.isnumeric().sum()
10840
df[~df['Reviews'].str.isnumeric()]
| App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10472 | Life Made WI-Fi Touchscreen Photo Frame | 1.9 | 19.0 | 3.0M | 1,000+ | Free | 0 | Everyone | NaN | February 11, 2018 | 1.0.19 | 4.0 and up | NaN |
df_copy=df.copy()
# Drop row 10472 — the only row whose Reviews value is non-numeric
# (its columns are shifted, as shown in the table above).
df_copy=df_copy.drop(df_copy.index[10472])
##Convert Reviews Datatype to int
# NOTE(review): the original line referenced a non-existent 'Review' column
# and called .astype() with no dtype, raising the KeyError recorded below.
df_copy['Reviews']=df_copy['Reviews'].astype(int)
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py:3621, in Index.get_loc(self, key, method, tolerance) 3620 try: -> 3621 return self._engine.get_loc(casted_key) 3622 except KeyError as err: File C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\index.pyx:136, in pandas._libs.index.IndexEngine.get_loc() File C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\index.pyx:163, in pandas._libs.index.IndexEngine.get_loc() File pandas\_libs\hashtable_class_helper.pxi:5198, in pandas._libs.hashtable.PyObjectHashTable.get_item() File pandas\_libs\hashtable_class_helper.pxi:5206, in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'Review' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) Input In [97], in <cell line: 2>() 1 ##Convert Reviews Datatype to int ----> 2 df_copy['Review']=df_copy['Review'].astype() File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py:3505, in DataFrame.__getitem__(self, key) 3503 if self.columns.nlevels > 1: 3504 return self._getitem_multilevel(key) -> 3505 indexer = self.columns.get_loc(key) 3506 if is_integer(indexer): 3507 indexer = [indexer] File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py:3623, in Index.get_loc(self, key, method, tolerance) 3621 return self._engine.get_loc(casted_key) 3622 except KeyError as err: -> 3623 raise KeyError(key) from err 3624 except TypeError: 3625 # If we have a listlike key, _check_indexing_error will raise 3626 # InvalidIndexError. Otherwise we fall through and re-raise 3627 # the TypeError. 3628 self._check_indexing_error(key) KeyError: 'Review'
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
df=pd.read_csv('height-weight.csv')
df.head()
| Weight | Height | |
|---|---|---|
| 0 | 45 | 120 |
| 1 | 58 | 135 |
| 2 | 48 | 123 |
| 3 | 60 | 145 |
| 4 | 70 | 160 |
# Visual check: the height/weight relationship looks roughly linear.
plt.scatter(df['Weight'],df['Height'])
plt.xlabel("Weight")
plt.ylabel("Height")
Text(0, 0.5, 'Height')
# Divide the dataset into independent and dependent features.
X=df[['Weight']] ##independent feature (double brackets keep it 2-D, as sklearn expects)
y=df['Height'] ##dependent feature (1-D Series)
## Train/test split: 20% held out, fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)
X.shape
(23, 1)
X_train.shape,X_test.shape,y_train.shape,y_test.shape
((18, 1), (5, 1), (18,), (5,))
## Standardize the independent features: fit the scaler on the training data
## only, then apply the same transform to the test data to avoid leakage.
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names warnings.warn(
plt.scatter(X_train,y_train)
<matplotlib.collections.PathCollection at 0x1bc4b6c68b0>
## Train a simple linear regression model on the scaled training data.
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train,y_train)
LinearRegression()
# Fitted parameters: height ~= intercept + coef * scaled_weight
print("The slope or coefficient of weight is ",regressor.coef_)
print("Intercept",regressor.intercept_)
The slope or coefficient of weight is [17.03440872] Intercept 157.5
# Overlay the fitted regression line (red) on the training scatter.
plt.scatter(X_train,y_train)
plt.plot(X_train,regressor.predict(X_train),'r')
[<matplotlib.lines.Line2D at 0x1bc4b89bd90>]
from sklearn.preprocessing import
Input In [9] from sklearn.preprocessing import ^ SyntaxError: invalid syntax
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
dataset=pd.read_csv("Algerian_forest_fires_dataset_UPDATE.csv")
dataset.head()
| Bejaia Region Dataset | |||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| day | month | year | Temperature | RH | Ws | Rain | FFMC | DMC | DC | ISI | BUI | FWI | Classes |
| 01 | 06 | 2012 | 29 | 57 | 18 | 0 | 65.7 | 3.4 | 7.6 | 1.3 | 3.4 | 0.5 | not fire |
| 02 | 06 | 2012 | 29 | 61 | 13 | 1.3 | 64.4 | 4.1 | 7.6 | 1 | 3.9 | 0.4 | not fire |
| 03 | 06 | 2012 | 26 | 82 | 22 | 13.1 | 47.1 | 2.5 | 7.1 | 0.3 | 2.7 | 0.1 | not fire |
| 04 | 06 | 2012 | 25 | 89 | 13 | 2.5 | 28.6 | 1.3 | 6.9 | 0 | 1.7 | 0 | not fire |
dataset.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 247 entries, ('day', 'month', 'year', 'Temperature', ' RH', ' Ws', 'Rain ', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI') to ('30', '09', '2012', '24', '64', '15', '0.2', '67.3', '3.8', '16.5', '1.2', '4.8', '0.5')
Data columns (total 1 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Bejaia Region Dataset 245 non-null object
dtypes: object(1)
memory usage: 49.3+ KB
##missing values
dataset[dataset.isnull().any(axis=1)]
| Bejaia Region Dataset | |||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Sidi-Bel Abbes Region Dataset | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 14 | 07 | 2012 | 37 | 37 | 18 | 0.2 | 88.9 | 12.9 | 14.6 9 | 12.5 | 10.4 | fire | NaN |
# Tag the two regions: rows up to 122 are the Bejaia dataset, the rest
# Sidi-Bel Abbes (the file concatenates both with a header row between).
# NOTE(review): .loc with positional slices emits the FutureWarning recorded
# below; an .iloc-based assignment would be the supported form going forward.
dataset.loc[:122,"Region"]=0
dataset.loc[122:,"Region"]=1
df=dataset
C:\Users\parsi\AppData\Local\Temp\ipykernel_24112\2651232788.py:1: FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version. Use .loc with labels or .iloc with positions instead. dataset.loc[:122,"Region"]=0 C:\Users\parsi\AppData\Local\Temp\ipykernel_24112\2651232788.py:2: FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version. Use .loc with labels or .iloc with positions instead. dataset.loc[122:,"Region"]=1
df.info
<bound method DataFrame.info of Bejaia Region Dataset \
day month year Temperature RH Ws Rain FFMC DMC DC ISI BUI FWI Classes
01 06 2012 29 57 18 0 65.7 3.4 7.6 1.3 3.4 0.5 not fire
02 06 2012 29 61 13 1.3 64.4 4.1 7.6 1 3.9 0.4 not fire
03 06 2012 26 82 22 13.1 47.1 2.5 7.1 0.3 2.7 0.1 not fire
04 06 2012 25 89 13 2.5 28.6 1.3 6.9 0 1.7 0 not fire
... ...
26 09 2012 30 65 14 0 85.4 16 44.5 4.5 16.9 6.5 fire
27 09 2012 28 87 15 4.4 41.1 6.5 8 0.1 6.2 0 not fire
28 09 2012 27 87 29 0.5 45.9 3.5 7.9 0.4 3.4 0.2 not fire
29 09 2012 24 54 18 0.1 79.7 4.3 15.2 1.7 5.1 0.7 not fire
30 09 2012 24 64 15 0.2 67.3 3.8 16.5 1.2 4.8 0.5 not fire
Region
day month year Temperature RH Ws Rain FFMC DMC DC ISI BUI FWI 0.0
01 06 2012 29 57 18 0 65.7 3.4 7.6 1.3 3.4 0.5 0.0
02 06 2012 29 61 13 1.3 64.4 4.1 7.6 1 3.9 0.4 0.0
03 06 2012 26 82 22 13.1 47.1 2.5 7.1 0.3 2.7 0.1 0.0
04 06 2012 25 89 13 2.5 28.6 1.3 6.9 0 1.7 0 0.0
... ...
26 09 2012 30 65 14 0 85.4 16 44.5 4.5 16.9 6.5 1.0
27 09 2012 28 87 15 4.4 41.1 6.5 8 0.1 6.2 0 1.0
28 09 2012 27 87 29 0.5 45.9 3.5 7.9 0.4 3.4 0.2 1.0
29 09 2012 24 54 18 0.1 79.7 4.3 15.2 1.7 5.1 0.7 1.0
30 09 2012 24 64 15 0.2 67.3 3.8 16.5 1.2 4.8 0.5 1.0
[247 rows x 2 columns]>
# NOTE(review): 'Reigin' is a typo — this creates a NEW int column instead of
# casting 'Region' in place; the intended line is likely
# df[['Region']]=df[['Region']].astype(int)
df[['Reigin']]=df[['Region']].astype(int)
df.head()
| Bejaia Region Dataset | Region | Reigin | |||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| day | month | year | Temperature | RH | Ws | Rain | FFMC | DMC | DC | ISI | BUI | FWI | Classes | 0.0 | 0 |
| 01 | 06 | 2012 | 29 | 57 | 18 | 0 | 65.7 | 3.4 | 7.6 | 1.3 | 3.4 | 0.5 | not fire | 0.0 | 0 |
| 02 | 06 | 2012 | 29 | 61 | 13 | 1.3 | 64.4 | 4.1 | 7.6 | 1 | 3.9 | 0.4 | not fire | 0.0 | 0 |
| 03 | 06 | 2012 | 26 | 82 | 22 | 13.1 | 47.1 | 2.5 | 7.1 | 0.3 | 2.7 | 0.1 | not fire | 0.0 | 0 |
| 04 | 06 | 2012 | 25 | 89 | 13 | 2.5 | 28.6 | 1.3 | 6.9 | 0 | 1.7 | 0 | not fire | 0.0 | 0 |
df.isnull().sum()
Bejaia Region Dataset 2 Region 0 Reigin 0 dtype: int64
df.iloc[[122]]
| Bejaia Region Dataset | |||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 30 | 09 | 2012 | 25 | 78 | 14 | 1.4 | 45 | 1.9 | 7.5 | 0.2 | 2.4 | 0.1 | not fire |
df=pd.read_csv('Algerian_forest_fires_cleaned_dataset.csv')
df.head()
| day | month | year | Temperature | RH | Ws | Rain | FFMC | DMC | DC | ISI | BUI | FWI | Classes | Region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 6 | 2012 | 29 | 57 | 18 | 0.0 | 65.7 | 3.4 | 7.6 | 1.3 | 3.4 | 0.5 | not fire | 0 |
| 1 | 2 | 6 | 2012 | 29 | 61 | 13 | 1.3 | 64.4 | 4.1 | 7.6 | 1.0 | 3.9 | 0.4 | not fire | 0 |
| 2 | 3 | 6 | 2012 | 26 | 82 | 22 | 13.1 | 47.1 | 2.5 | 7.1 | 0.3 | 2.7 | 0.1 | not fire | 0 |
| 3 | 4 | 6 | 2012 | 25 | 89 | 13 | 2.5 | 28.6 | 1.3 | 6.9 | 0.0 | 1.7 | 0.0 | not fire | 0 |
| 4 | 5 | 6 | 2012 | 27 | 77 | 16 | 0.0 | 64.8 | 3.0 | 14.2 | 1.2 | 3.9 | 0.5 | not fire | 0 |
df.columns
Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes', 'Region'],
dtype='object')
# Drop the date components; they are not used as model features here.
df.drop(['day','month','year'],axis=1,inplace=True)
df.head()
| Temperature | RH | Ws | Rain | FFMC | DMC | DC | ISI | BUI | FWI | Classes | Region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 29 | 57 | 18 | 0.0 | 65.7 | 3.4 | 7.6 | 1.3 | 3.4 | 0.5 | not fire | 0 |
| 1 | 29 | 61 | 13 | 1.3 | 64.4 | 4.1 | 7.6 | 1.0 | 3.9 | 0.4 | not fire | 0 |
| 2 | 26 | 82 | 22 | 13.1 | 47.1 | 2.5 | 7.1 | 0.3 | 2.7 | 0.1 | not fire | 0 |
| 3 | 25 | 89 | 13 | 2.5 | 28.6 | 1.3 | 6.9 | 0.0 | 1.7 | 0.0 | not fire | 0 |
| 4 | 27 | 77 | 16 | 0.0 | 64.8 | 3.0 | 14.2 | 1.2 | 3.9 | 0.5 | not fire | 0 |
df['Classes'].value_counts()
fire 131 not fire 101 fire 4 fire 2 not fire 2 not fire 1 not fire 1 not fire 1 Name: Classes, dtype: int64
# Binary-encode the target: any value containing "not fire" -> 0, else 1.
# str.contains also absorbs the stray-whitespace variants visible in
# value_counts above ('fire   ', 'not fire     ', ...).
df["Classes"]=np.where(df['Classes'].str.contains("not fire"),0,1)
df.tail()
| Temperature | RH | Ws | Rain | FFMC | DMC | DC | ISI | BUI | FWI | Classes | Region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 238 | 30 | 65 | 14 | 0.0 | 85.4 | 16.0 | 44.5 | 4.5 | 16.9 | 6.5 | 1 | 1 |
| 239 | 28 | 87 | 15 | 4.4 | 41.1 | 6.5 | 8.0 | 0.1 | 6.2 | 0.0 | 0 | 1 |
| 240 | 27 | 87 | 29 | 0.5 | 45.9 | 3.5 | 7.9 | 0.4 | 3.4 | 0.2 | 0 | 1 |
| 241 | 24 | 54 | 18 | 0.1 | 79.7 | 4.3 | 15.2 | 1.7 | 5.1 | 0.7 | 0 | 1 |
| 242 | 24 | 64 | 15 | 0.2 | 67.3 | 3.8 | 16.5 | 1.2 | 4.8 | 0.5 | 0 | 1 |
X_train.corr()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [45], in <cell line: 1>() ----> 1 X_train.corr() NameError: name 'X_train' is not defined
df['Classes'].value_counts()
1 137 0 106 Name: Classes, dtype: int64
# FWI (Fire Weather Index) is the regression target; every other column is a feature.
X=df.drop('FWI',axis=1)
y=df['FWI']
X.head()
| Temperature | RH | Ws | Rain | FFMC | DMC | DC | ISI | BUI | Classes | Region | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 29 | 57 | 18 | 0.0 | 65.7 | 3.4 | 7.6 | 1.3 | 3.4 | 0 | 0 |
| 1 | 29 | 61 | 13 | 1.3 | 64.4 | 4.1 | 7.6 | 1.0 | 3.9 | 0 | 0 |
| 2 | 26 | 82 | 22 | 13.1 | 47.1 | 2.5 | 7.1 | 0.3 | 2.7 | 0 | 0 |
| 3 | 25 | 89 | 13 | 2.5 | 28.6 | 1.3 | 6.9 | 0.0 | 1.7 | 0 | 0 |
| 4 | 27 | 77 | 16 | 0.0 | 64.8 | 3.0 | 14.2 | 1.2 | 3.9 | 0 | 0 |
y
0 0.5
1 0.4
2 0.1
3 0.0
4 0.5
...
238 6.5
239 0.0
240 0.2
241 0.7
242 0.5
Name: FWI, Length: 243, dtype: float64
# Train/test split: 25% held out, fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)
X_train.shape,X_test.shape
((182, 11), (61, 11))
##Feature Selection based on correlation
X_train.corr()
| Temperature | RH | Ws | Rain | FFMC | DMC | DC | ISI | BUI | Classes | Region | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Temperature | 1.000000 | -0.656095 | -0.305977 | -0.317512 | 0.694768 | 0.498173 | 0.390684 | 0.629848 | 0.473609 | 0.542141 | 0.254549 |
| RH | -0.656095 | 1.000000 | 0.225736 | 0.241656 | -0.653023 | -0.414601 | -0.236078 | -0.717804 | -0.362317 | -0.456876 | -0.394665 |
| Ws | -0.305977 | 0.225736 | 1.000000 | 0.251932 | -0.190076 | 0.000379 | 0.096576 | -0.023558 | 0.035633 | -0.082570 | -0.199969 |
| Rain | -0.317512 | 0.241656 | 0.251932 | 1.000000 | -0.545491 | -0.289754 | -0.302341 | -0.345707 | -0.300964 | -0.369357 | -0.059022 |
| FFMC | 0.694768 | -0.653023 | -0.190076 | -0.545491 | 1.000000 | 0.620807 | 0.524101 | 0.750799 | 0.607210 | 0.781259 | 0.249514 |
| DMC | 0.498173 | -0.414601 | 0.000379 | -0.289754 | 0.620807 | 1.000000 | 0.868647 | 0.685656 | 0.983175 | 0.617273 | 0.212582 |
| DC | 0.390684 | -0.236078 | 0.096576 | -0.302341 | 0.524101 | 0.868647 | 1.000000 | 0.513701 | 0.942414 | 0.543581 | -0.060838 |
| ISI | 0.629848 | -0.717804 | -0.023558 | -0.345707 | 0.750799 | 0.685656 | 0.513701 | 1.000000 | 0.643818 | 0.742977 | 0.296441 |
| BUI | 0.473609 | -0.362317 | 0.035633 | -0.300964 | 0.607210 | 0.983175 | 0.942414 | 0.643818 | 1.000000 | 0.612239 | 0.114897 |
| Classes | 0.542141 | -0.456876 | -0.082570 | -0.369357 | 0.781259 | 0.617273 | 0.543581 | 0.742977 | 0.612239 | 1.000000 | 0.188837 |
| Region | 0.254549 | -0.394665 | -0.199969 | -0.059022 | 0.249514 | 0.212582 | -0.060838 | 0.296441 | 0.114897 | 0.188837 | 1.000000 |
# Visualize pairwise feature correlations to spot multicollinearity
# (e.g. BUI vs DMC/DC are near 1 in the table above).
plt.figure(figsize=(12,10))
corr=X_train.corr()
sns.heatmap(corr,annot=True)
<AxesSubplot:>
def correlation(dataset, threshold):
    """Return the set of column names whose absolute pairwise correlation
    with an earlier column exceeds *threshold*.

    Only the lower triangle of the correlation matrix is scanned, so for
    each highly-correlated pair the later column (by position) is flagged
    for potential dropping; the earlier one is kept.

    Parameters
    ----------
    dataset : pandas.DataFrame of numeric columns
    threshold : float in [0, 1] — absolute-correlation cutoff

    Returns
    -------
    set[str] — column names to consider removing
    """
    col_corr = set()
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):  # lower triangle only: each pair visited once
            if abs(corr_matrix.iloc[i, j]) > threshold:
                # BUG FIX: original referenced undefined name 'corr_martix'
                # (typo), raising NameError on the first hit.
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr
## The correlation threshold is chosen from domain knowledge, not a fixed rule.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df=pd.read_csv('Algerian_forest_fires_cleaned_dataset.csv')
df.head()
| day | month | year | Temperature | RH | Ws | Rain | FFMC | DMC | DC | ISI | BUI | FWI | Classes | Region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 6 | 2012 | 29 | 57 | 18 | 0.0 | 65.7 | 3.4 | 7.6 | 1.3 | 3.4 | 0.5 | not fire | 0 |
| 1 | 2 | 6 | 2012 | 29 | 61 | 13 | 1.3 | 64.4 | 4.1 | 7.6 | 1.0 | 3.9 | 0.4 | not fire | 0 |
| 2 | 3 | 6 | 2012 | 26 | 82 | 22 | 13.1 | 47.1 | 2.5 | 7.1 | 0.3 | 2.7 | 0.1 | not fire | 0 |
| 3 | 4 | 6 | 2012 | 25 | 89 | 13 | 2.5 | 28.6 | 1.3 | 6.9 | 0.0 | 1.7 | 0.0 | not fire | 0 |
| 4 | 5 | 6 | 2012 | 27 | 77 | 16 | 0.0 | 64.8 | 3.0 | 14.2 | 1.2 | 3.9 | 0.5 | not fire | 0 |
df.columns
Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes', 'Region'],
dtype='object')
#drop month, day and year — date parts are not predictive features here
df.drop(['day','month','year'],axis=1,inplace=True)  # axis=1: drop columns, in place
df.head()  # confirm the columns are gone
| Temperature | RH | Ws | Rain | FFMC | DMC | DC | ISI | BUI | FWI | Classes | Region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 29 | 57 | 18 | 0.0 | 65.7 | 3.4 | 7.6 | 1.3 | 3.4 | 0.5 | not fire | 0 |
| 1 | 29 | 61 | 13 | 1.3 | 64.4 | 4.1 | 7.6 | 1.0 | 3.9 | 0.4 | not fire | 0 |
| 2 | 26 | 82 | 22 | 13.1 | 47.1 | 2.5 | 7.1 | 0.3 | 2.7 | 0.1 | not fire | 0 |
| 3 | 25 | 89 | 13 | 2.5 | 28.6 | 1.3 | 6.9 | 0.0 | 1.7 | 0.0 | not fire | 0 |
| 4 | 27 | 77 | 16 | 0.0 | 64.8 | 3.0 | 14.2 | 1.2 | 3.9 | 0.5 | not fire | 0 |
df['Classes'].value_counts()
fire 131 not fire 101 fire 4 fire 2 not fire 2 not fire 1 not fire 1 not fire 1 Name: Classes, dtype: int64
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
#Read the height-weight dataset
df=pd.read_csv('height-weight.csv')
df.head(2)  # quick look at the first two rows
| Weight | Height | |
|---|---|---|
| 0 | 45 | 120 |
| 1 | 58 | 135 |
# Visualize the (roughly linear) Weight vs Height relationship
plt.scatter(df['Weight'],df['Height'])
plt.xlabel('Weight')
plt.ylabel('Height')
Text(0, 0.5, 'Height')
##Divide the dataset into independent and dependent features
X=df[['Weight']] #Independent feature — double brackets keep it a 2-D DataFrame, as sklearn expects
y=df['Height']#Dependent feature (1-D Series)
#Train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42) #20 percent held out for testing; random_state fixes the shuffle
X.shape
(23, 1)
X_train.shape,X_test.shape #see in the test data 20 percent of data is there
((18, 1), (5, 1))
#Standardize the independent training data
from sklearn.preprocessing import StandardScaler # z-score scaling: (x - mean) / std
scaler=StandardScaler()
X_train.head()  # peek at the raw (unscaled) training values
| Weight | |
|---|---|
| 12 | 105 |
| 1 | 58 |
| 13 | 100 |
| 5 | 78 |
| 2 | 48 |
X_train=scaler.fit_transform(X_train) # fit learns mean/std from the TRAINING data only, then standardizes it
X_test=scaler.transform(X_test)  # reuse the training statistics on test data (avoids data leakage)
plt.scatter(X_train,y_train)  # scaling preserves the shape of the relationship
<matplotlib.collections.PathCollection at 0x2cfd36e7f40>
##Train the simple linear regression model
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train,y_train)  # learns the slope (coef_) and intercept_ from the scaled training data
LinearRegression()
print('The slope or coefficient of weight is',regressor.coef_) #slope (per standardized unit of Weight)
print('Intercept',regressor.intercept_)  # predicted Height when the scaled Weight is 0 (i.e. at the mean Weight)
The slope or coefficient of weight is [17.03440872] Intercept 157.5
# Training points overlaid with the fitted regression line
plt.scatter(X_train,y_train)
plt.plot(X_train,regressor.predict(X_train))
[<matplotlib.lines.Line2D at 0x2cfd43646a0>]
y_pred_test=regressor.predict(X_test)  # predictions on the held-out 20%
y_pred_test,y_test  # display predictions next to the true values
(array([161.08467086, 161.08467086, 129.3041561 , 177.45645118,
148.56507414]),
15 177
9 170
0 120
8 182
17 159
Name: Height, dtype: int64)
# Test points overlaid with the regression line evaluated on the test inputs
plt.scatter(X_test,y_test)
plt.plot(X_test,regressor.predict(X_test))
[<matplotlib.lines.Line2D at 0x2cfd7f62820>]
#Performance metrics
##MSE, MAE, RMSE
from sklearn.metrics import mean_squared_error,mean_absolute_error
mse=mean_squared_error(y_test,y_pred_test)  # mean squared error
mae=mean_absolute_error(y_test,y_pred_test)  # mean absolute error
rmse=np.sqrt(mse)  # root mean squared error (same units as Height)
print(mse)
print(mae)
print(rmse)
109.77592599051654 9.822657814519227 10.477400726827076
from sklearn.metrics import r2_score
score=r2_score(y_test,y_pred_test)  # fraction of Height variance explained by the model
score
0.7769869860423441
residuals=y_test-y_pred_test  # prediction errors; should look roughly zero-centered
residuals
15 15.915329 9 8.915329 0 -9.304156 8 4.543549 17 10.434926 Name: Height, dtype: float64
import seaborn as sns

# Plot the residual distribution with a KDE overlay to eyeball normality.
# Fixed: the original called sns.distplot, which is deprecated — the
# FutureWarning in the original run directs users to displot/histplot.
# histplot(kde=True) is the axes-level replacement with equivalent output.
sns.histplot(residuals, kde=True)
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='Height', ylabel='Density'>
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
california=fetch_california_housing()  # Bunch with data, target, feature_names, DESCR
type(california)
sklearn.utils.Bunch
california.keys()
dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])
print(california.DESCR)
.. _california_housing_dataset:
California Housing dataset
--------------------------
**Data Set Characteristics:**
:Number of Instances: 20640
:Number of Attributes: 8 numeric, predictive attributes and the target
:Attribute Information:
- MedInc median income in block group
- HouseAge median house age in block group
- AveRooms average number of rooms per household
- AveBedrms average number of bedrooms per household
- Population block group population
- AveOccup average number of household members
- Latitude block group latitude
- Longitude block group longitude
:Missing Attribute Values: None
This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html
The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).
This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bureau publishes sample data (a block group typically has a population
of 600 to 3,000 people).
An household is a group of people residing within a home. Since the average
number of rooms and bedrooms in this dataset are provided per household, these
columns may take surprisingly large values for block groups with few households
and many empty houses, such as vacation resorts.
It can be downloaded/loaded using the
:func:`sklearn.datasets.fetch_california_housing` function.
.. topic:: References
- Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297
california.target_names
['MedHouseVal']
print(california.data)
[[ 8.3252 41. 6.98412698 ... 2.55555556
37.88 -122.23 ]
[ 8.3014 21. 6.23813708 ... 2.10984183
37.86 -122.22 ]
[ 7.2574 52. 8.28813559 ... 2.80225989
37.85 -122.24 ]
...
[ 1.7 17. 5.20554273 ... 2.3256351
39.43 -121.22 ]
[ 1.8672 18. 5.32951289 ... 2.12320917
39.43 -121.32 ]
[ 2.3886 16. 5.25471698 ... 2.61698113
39.37 -121.24 ]]
print(california.target)
[4.526 3.585 3.521 ... 0.923 0.847 0.894]
california.feature_names
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
##Prepare the DataFrame from the Bunch's raw arrays
dataset=pd.DataFrame(california.data,columns=california.feature_names)
dataset.head()  # 8 numeric predictors, 20640 rows
| MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | |
|---|---|---|---|---|---|---|---|---|
| 0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 |
| 1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 |
| 2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 |
| 3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 |
| 4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 |
dataset['Price']=california.target  # target: median house value, in $100,000s
dataset.head()
| MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | Price | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 | 4.526 |
| 1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 | 3.585 |
| 2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 | 3.521 |
| 3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 | 3.413 |
| 4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 | 3.422 |
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20640 entries, 0 to 20639 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MedInc 20640 non-null float64 1 HouseAge 20640 non-null float64 2 AveRooms 20640 non-null float64 3 AveBedrms 20640 non-null float64 4 Population 20640 non-null float64 5 AveOccup 20640 non-null float64 6 Latitude 20640 non-null float64 7 Longitude 20640 non-null float64 8 Price 20640 non-null float64 dtypes: float64(9) memory usage: 1.4 MB
dataset.isnull().sum()
MedInc 0 HouseAge 0 AveRooms 0 AveBedrms 0 Population 0 AveOccup 0 Latitude 0 Longitude 0 Price 0 dtype: int64
import seaborn as sns
sns.pairplot(dataset)  # pairwise scatter matrix of every feature vs every other
<seaborn.axisgrid.PairGrid at 0x294992a7c70>
dataset.corr()
| MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | Price | |
|---|---|---|---|---|---|---|---|---|---|
| MedInc | 1.000000 | -0.119034 | 0.326895 | -0.062040 | 0.004834 | 0.018766 | -0.079809 | -0.015176 | 0.688075 |
| HouseAge | -0.119034 | 1.000000 | -0.153277 | -0.077747 | -0.296244 | 0.013191 | 0.011173 | -0.108197 | 0.105623 |
| AveRooms | 0.326895 | -0.153277 | 1.000000 | 0.847621 | -0.072213 | -0.004852 | 0.106389 | -0.027540 | 0.151948 |
| AveBedrms | -0.062040 | -0.077747 | 0.847621 | 1.000000 | -0.066197 | -0.006181 | 0.069721 | 0.013344 | -0.046701 |
| Population | 0.004834 | -0.296244 | -0.072213 | -0.066197 | 1.000000 | 0.069863 | -0.108785 | 0.099773 | -0.024650 |
| AveOccup | 0.018766 | 0.013191 | -0.004852 | -0.006181 | 0.069863 | 1.000000 | 0.002366 | 0.002476 | -0.023737 |
| Latitude | -0.079809 | 0.011173 | 0.106389 | 0.069721 | -0.108785 | 0.002366 | 1.000000 | -0.924664 | -0.144160 |
| Longitude | -0.015176 | -0.108197 | -0.027540 | 0.013344 | 0.099773 | 0.002476 | -0.924664 | 1.000000 | -0.045967 |
| Price | 0.688075 | 0.105623 | 0.151948 | -0.046701 | -0.024650 | -0.023737 | -0.144160 | -0.045967 | 1.000000 |
sns.heatmap(dataset.corr(),annot=True)
<AxesSubplot:>
dataset.head()
| MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | Price | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 | 4.526 |
| 1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 | 3.585 |
| 2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 | 3.521 |
| 3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 | 3.413 |
| 4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 | 3.422 |
#independent and dependent features
# Fixed: the original used dataset.iloc[:,:,-1] — three indexers on a
# 2-D DataFrame — which raises IndexingError: Too many indexers (see the
# traceback in the original run). iloc[:, :-1] selects every column
# except the last; iloc[:, -1] selects the last column ('Price').
X=dataset.iloc[:,:-1] #independent features (all columns but 'Price')
y=dataset.iloc[:,-1] #dependent feature ('Price')
--------------------------------------------------------------------------- IndexingError Traceback (most recent call last) Input In [28], in <cell line: 2>() 1 #independent and dependent feature ----> 2 X=dataset.iloc[:,:,-1] #independent features 3 y=dataset.iloc[:,-1] File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:961, in _LocationIndexer.__getitem__(self, key) 959 if self._is_scalar_access(key): 960 return self.obj._get_value(*key, takeable=self._takeable) --> 961 return self._getitem_tuple(key) 962 else: 963 # we by definition only have the 0th axis 964 axis = self.axis or 0 File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:1458, in _iLocIndexer._getitem_tuple(self, tup) 1456 def _getitem_tuple(self, tup: tuple): -> 1458 tup = self._validate_tuple_indexer(tup) 1459 with suppress(IndexingError): 1460 return self._getitem_lowerdim(tup) File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:765, in _LocationIndexer._validate_tuple_indexer(self, key) 761 def _validate_tuple_indexer(self, key: tuple) -> tuple: 762 """ 763 Check the key for valid keys across my indexer. 764 """ --> 765 key = self._validate_key_length(key) 766 key = self._expand_ellipsis(key) 767 for i, k in enumerate(key): File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:812, in _LocationIndexer._validate_key_length(self, key) 810 raise IndexingError(_one_ellipsis_message) 811 return self._validate_key_length(key) --> 812 raise IndexingError("Too many indexers") 813 return key IndexingError: Too many indexers